
Commit b76e883

improve MultiBandBlender cuda, add normalizeUsingWeight and addSrcWeight kernels

1 parent: 642e4d9
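For context, the commit plugs into the existing cv::detail::MultiBandBlender interface (prepare / feed / blend), with the GPU path selected by the try_gpu constructor argument. The sketch below is illustrative only and not part of the commit; the file names, ROI, corner offsets, and band count are made-up placeholders.

// Hypothetical usage sketch (not from this commit): blend two pre-warped images.
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/stitching/detail/blenders.hpp>

int main()
{
    cv::Mat img1 = cv::imread("warped1.png"), mask1 = cv::imread("mask1.png", cv::IMREAD_GRAYSCALE);
    cv::Mat img2 = cv::imread("warped2.png"), mask2 = cv::imread("mask2.png", cv::IMREAD_GRAYSCALE);

    // try_gpu = true takes the CUDA path added here when OpenCV is built with
    // cudaarithm + cudawarping; otherwise the existing CPU/OpenCL path runs.
    cv::detail::MultiBandBlender blender(/*try_gpu*/ true, /*num_bands*/ 5, /*weight_type*/ CV_32F);

    blender.prepare(cv::Rect(0, 0, 2000, 1500));   // ROI of the final mosaic (placeholder)
    img1.convertTo(img1, CV_16S);                  // feed() works on CV_16SC3 images
    img2.convertTo(img2, CV_16S);
    blender.feed(img1, mask1, cv::Point(0, 0));    // top-left corner of each image within the ROI
    blender.feed(img2, mask2, cv::Point(800, 0));

    cv::Mat result, result_mask;
    blender.blend(result, result_mask);            // result is CV_16SC3; convert before saving
    result.convertTo(result, CV_8U);
    cv::imwrite("pano.png", result);
    return 0;
}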

File tree: 3 files changed, +288 -26 lines


modules/stitching/include/opencv2/stitching/detail/blenders.hpp

Lines changed: 4 additions & 0 deletions
@@ -142,6 +142,10 @@ class CV_EXPORTS MultiBandBlender : public Blender
     Rect dst_roi_final_;
     bool can_use_gpu_;
     int weight_type_; //CV_32F or CV_16S
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    std::vector<cuda::GpuMat> gpu_dst_pyr_laplace_;
+    std::vector<cuda::GpuMat> gpu_dst_band_weights_;
+#endif
 };
 
 
modules/stitching/src/blenders.cpp

Lines changed: 172 additions & 26 deletions
@@ -43,6 +43,23 @@
 #include "precomp.hpp"
 #include "opencl_kernels_stitching.hpp"
 
+#ifdef HAVE_CUDA
+namespace cv { namespace cuda { namespace device
+{
+    namespace blend
+    {
+        void addSrcWeightGpu16S(const PtrStep<short> src, const PtrStep<short> src_weight,
+                                PtrStep<short> dst, PtrStep<short> dst_weight, cv::Rect &rc);
+        void addSrcWeightGpu32F(const PtrStep<short> src, const PtrStepf src_weight,
+                                PtrStep<short> dst, PtrStepf dst_weight, cv::Rect &rc);
+        void normalizeUsingWeightMapGpu16S(const PtrStep<short> weight, PtrStep<short> src,
+                                           const int width, const int height);
+        void normalizeUsingWeightMapGpu32F(const PtrStepf weight, PtrStep<short> src,
+                                           const int width, const int height);
+    }
+}}}
+#endif
+
 namespace cv {
 namespace detail {
 
@@ -228,21 +245,46 @@ void MultiBandBlender::prepare(Rect dst_roi)
 
     Blender::prepare(dst_roi);
 
-    dst_pyr_laplace_.resize(num_bands_ + 1);
-    dst_pyr_laplace_[0] = dst_;
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    if (can_use_gpu_)
+    {
+        gpu_dst_pyr_laplace_.resize(num_bands_ + 1);
+        gpu_dst_pyr_laplace_[0].create(dst_roi.size(), CV_16SC3);
+        gpu_dst_pyr_laplace_[0].setTo(Scalar::all(0));
 
-    dst_band_weights_.resize(num_bands_ + 1);
-    dst_band_weights_[0].create(dst_roi.size(), weight_type_);
-    dst_band_weights_[0].setTo(0);
+        gpu_dst_band_weights_.resize(num_bands_ + 1);
+        gpu_dst_band_weights_[0].create(dst_roi.size(), weight_type_);
+        gpu_dst_band_weights_[0].setTo(0);
 
-    for (int i = 1; i <= num_bands_; ++i)
+        for (int i = 1; i <= num_bands_; ++i)
+        {
+            gpu_dst_pyr_laplace_[i].create((gpu_dst_pyr_laplace_[i - 1].rows + 1) / 2,
+                                           (gpu_dst_pyr_laplace_[i - 1].cols + 1) / 2, CV_16SC3);
+            gpu_dst_band_weights_[i].create((gpu_dst_band_weights_[i - 1].rows + 1) / 2,
+                                            (gpu_dst_band_weights_[i - 1].cols + 1) / 2, weight_type_);
+            gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0));
+            gpu_dst_band_weights_[i].setTo(0);
+        }
+    }
+    else
+#endif
     {
-        dst_pyr_laplace_[i].create((dst_pyr_laplace_[i - 1].rows + 1) / 2,
-                                   (dst_pyr_laplace_[i - 1].cols + 1) / 2, CV_16SC3);
-        dst_band_weights_[i].create((dst_band_weights_[i - 1].rows + 1) / 2,
-                                    (dst_band_weights_[i - 1].cols + 1) / 2, weight_type_);
-        dst_pyr_laplace_[i].setTo(Scalar::all(0));
-        dst_band_weights_[i].setTo(0);
+        dst_pyr_laplace_.resize(num_bands_ + 1);
+        dst_pyr_laplace_[0] = dst_;
+
+        dst_band_weights_.resize(num_bands_ + 1);
+        dst_band_weights_[0].create(dst_roi.size(), weight_type_);
+        dst_band_weights_[0].setTo(0);
+
+        for (int i = 1; i <= num_bands_; ++i)
+        {
+            dst_pyr_laplace_[i].create((dst_pyr_laplace_[i - 1].rows + 1) / 2,
+                                       (dst_pyr_laplace_[i - 1].cols + 1) / 2, CV_16SC3);
+            dst_band_weights_[i].create((dst_band_weights_[i - 1].rows + 1) / 2,
+                                        (dst_band_weights_[i - 1].cols + 1) / 2, weight_type_);
+            dst_pyr_laplace_[i].setTo(Scalar::all(0));
+            dst_band_weights_[i].setTo(0);
+        }
     }
 }
 
@@ -312,6 +354,76 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
     int bottom = br_new.y - tl.y - img.rows;
     int right = br_new.x - tl.x - img.cols;
 
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    if (can_use_gpu_)
+    {
+        // Create the source image Laplacian pyramid
+        cuda::GpuMat gpu_img;
+        gpu_img.upload(img);
+        cuda::GpuMat img_with_border;
+        cuda::copyMakeBorder(gpu_img, img_with_border, top, bottom, left, right, BORDER_REFLECT);
+        std::vector<cuda::GpuMat> gpu_src_pyr_laplace(num_bands_ + 1);
+        img_with_border.convertTo(gpu_src_pyr_laplace[0], CV_16S);
+        for (int i = 0; i < num_bands_; ++i)
+            cuda::pyrDown(gpu_src_pyr_laplace[i], gpu_src_pyr_laplace[i + 1]);
+        for (int i = 0; i < num_bands_; ++i)
+        {
+            cuda::GpuMat up;
+            cuda::pyrUp(gpu_src_pyr_laplace[i + 1], up);
+            cuda::subtract(gpu_src_pyr_laplace[i], up, gpu_src_pyr_laplace[i]);
+        }
+
+        // Create the weight map Gaussian pyramid
+        cuda::GpuMat gpu_mask;
+        gpu_mask.upload(mask);
+        cuda::GpuMat weight_map;
+        std::vector<cuda::GpuMat> gpu_weight_pyr_gauss(num_bands_ + 1);
+
+        if (weight_type_ == CV_32F)
+        {
+            gpu_mask.convertTo(weight_map, CV_32F, 1. / 255.);
+        }
+        else // weight_type_ == CV_16S
+        {
+            gpu_mask.convertTo(weight_map, CV_16S);
+            cuda::GpuMat add_mask;
+            cuda::compare(gpu_mask, 0, add_mask, CMP_NE);
+            cuda::add(weight_map, Scalar::all(1), weight_map, add_mask);
+        }
+        cuda::copyMakeBorder(weight_map, gpu_weight_pyr_gauss[0], top, bottom, left, right, BORDER_CONSTANT);
+        for (int i = 0; i < num_bands_; ++i)
+            cuda::pyrDown(gpu_weight_pyr_gauss[i], gpu_weight_pyr_gauss[i + 1]);
+
+        int y_tl = tl_new.y - dst_roi_.y;
+        int y_br = br_new.y - dst_roi_.y;
+        int x_tl = tl_new.x - dst_roi_.x;
+        int x_br = br_new.x - dst_roi_.x;
+
+        // Add weighted layer of the source image to the final Laplacian pyramid layer
+        for (int i = 0; i <= num_bands_; ++i)
+        {
+            Rect rc(x_tl, y_tl, x_br - x_tl, y_br - y_tl);
+            cuda::GpuMat &_src_pyr_laplace = gpu_src_pyr_laplace[i];
+            cuda::GpuMat _dst_pyr_laplace = gpu_dst_pyr_laplace_[i](rc);
+            cuda::GpuMat &_weight_pyr_gauss = gpu_weight_pyr_gauss[i];
+            cuda::GpuMat _dst_band_weights = gpu_dst_band_weights_[i](rc);
+
+            using namespace cv::cuda::device::blend;
+            if (weight_type_ == CV_32F)
+            {
+                addSrcWeightGpu32F(_src_pyr_laplace, _weight_pyr_gauss, _dst_pyr_laplace, _dst_band_weights, rc);
+            }
+            else
+            {
+                addSrcWeightGpu16S(_src_pyr_laplace, _weight_pyr_gauss, _dst_pyr_laplace, _dst_band_weights, rc);
+            }
+            x_tl /= 2; y_tl /= 2;
+            x_br /= 2; y_br /= 2;
+        }
+        return;
+    }
+#endif
+
     // Create the source image Laplacian pyramid
     UMat img_with_border;
     copyMakeBorder(_img, img_with_border, top, bottom, left, right,
@@ -322,10 +434,7 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 #endif
 
     std::vector<UMat> src_pyr_laplace;
-    if (can_use_gpu_ && img_with_border.depth() == CV_16S)
-        createLaplacePyrGpu(img_with_border, num_bands_, src_pyr_laplace);
-    else
-        createLaplacePyr(img_with_border, num_bands_, src_pyr_laplace);
+    createLaplacePyr(img_with_border, num_bands_, src_pyr_laplace);
 
     LOGLN("  Create the source image Laplacian pyramid, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec");
 #if ENABLE_LOG
@@ -431,20 +540,57 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 
 void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
 {
-    for (int i = 0; i <= num_bands_; ++i)
-        normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]);
-
+    cv::UMat dst_band_weights_0;
+    Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height);
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
     if (can_use_gpu_)
-        restoreImageFromLaplacePyrGpu(dst_pyr_laplace_);
+    {
+        for (int i = 0; i <= num_bands_; ++i)
+        {
+            cuda::GpuMat dst_i = gpu_dst_pyr_laplace_[i];
+            cuda::GpuMat weight_i = gpu_dst_band_weights_[i];
+
+            using namespace ::cv::cuda::device::blend;
+            if (weight_type_ == CV_32F)
+            {
+                normalizeUsingWeightMapGpu32F(weight_i, dst_i, weight_i.cols, weight_i.rows);
+            }
+            else
+            {
+                normalizeUsingWeightMapGpu16S(weight_i, dst_i, weight_i.cols, weight_i.rows);
+            }
+        }
+
+        // Restore image from Laplacian pyramid
+        for (size_t i = num_bands_; i > 0; --i)
+        {
+            cuda::GpuMat up;
+            cuda::pyrUp(gpu_dst_pyr_laplace_[i], up);
+            cuda::add(up, gpu_dst_pyr_laplace_[i - 1], gpu_dst_pyr_laplace_[i - 1]);
+        }
+
+        gpu_dst_pyr_laplace_[0](dst_rc).download(dst_);
+        gpu_dst_band_weights_[0].download(dst_band_weights_0);
+
+        gpu_dst_pyr_laplace_.clear();
+        gpu_dst_band_weights_.clear();
+    }
     else
+#endif
+    {
+        for (int i = 0; i <= num_bands_; ++i)
+            normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]);
+
         restoreImageFromLaplacePyr(dst_pyr_laplace_);
 
-    Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height);
-    dst_ = dst_pyr_laplace_[0](dst_rc);
-    UMat _dst_mask;
-    compare(dst_band_weights_[0](dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
-    dst_pyr_laplace_.clear();
-    dst_band_weights_.clear();
+        dst_ = dst_pyr_laplace_[0](dst_rc);
+        dst_band_weights_0 = dst_band_weights_[0];
+
+        dst_pyr_laplace_.clear();
+        dst_band_weights_.clear();
+    }
+
+    compare(dst_band_weights_0(dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
 
     Blender::blend(dst, dst_mask);
 }
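Both new GPU branches lean on the same pyramid primitives from cudawarping and cudaarithm: feed() builds a per-image Laplacian pyramid (pyrDown each level, then subtract the pyrUp of the next level) plus a Gaussian weight pyramid, and blend() normalizes each band and collapses the accumulated pyramid with pyrUp + add. Each level is (rows + 1) / 2 by (cols + 1) / 2 of the previous one, e.g. a 2000x1500 ROI gives 1000x750, then 500x375 for the first bands. The standalone sketch below only illustrates that build/collapse round trip; it is not code from the commit, and it pads the input itself so every level size matches after pyrUp (the blender handles this through its own copyMakeBorder calls).

// Illustrative sketch only: GPU Laplacian pyramid build and collapse with the
// same cv::cuda calls the new code paths use. "input.png" is a placeholder.
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudawarping.hpp>
#include <vector>

int main()
{
    const int num_bands = 4;
    cv::Mat host = cv::imread("input.png");

    // Pad so width/height are multiples of 2^num_bands; then pyrUp(pyrDown(x))
    // has exactly the size of x at every level and subtract/add sizes match.
    int step = 1 << num_bands;
    cv::Mat padded;
    cv::copyMakeBorder(host, padded, 0, (step - host.rows % step) % step,
                       0, (step - host.cols % step) % step, cv::BORDER_REFLECT);

    std::vector<cv::cuda::GpuMat> pyr(num_bands + 1);
    cv::cuda::GpuMat img;
    img.upload(padded);
    img.convertTo(pyr[0], CV_16S);                    // signed 16-bit holds negative band values

    for (int i = 0; i < num_bands; ++i)               // Gaussian levels
        cv::cuda::pyrDown(pyr[i], pyr[i + 1]);
    for (int i = 0; i < num_bands; ++i)               // turn levels into band-pass (Laplacian) layers
    {
        cv::cuda::GpuMat up;
        cv::cuda::pyrUp(pyr[i + 1], up);
        cv::cuda::subtract(pyr[i], up, pyr[i]);
    }

    for (int i = num_bands; i > 0; --i)               // collapse: inverse of the loop above
    {
        cv::cuda::GpuMat up;
        cv::cuda::pyrUp(pyr[i], up);
        cv::cuda::add(up, pyr[i - 1], pyr[i - 1]);
    }

    cv::Mat restored;
    pyr[0].download(restored);                        // ~= padded input, up to filtering error
    restored.convertTo(restored, CV_8U);
    cv::imwrite("roundtrip.png", restored);
    return 0;
}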
Lines changed: 112 additions & 0 deletions (new file)
@@ -0,0 +1,112 @@
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/types.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace blend
+    {
+        __global__ void addSrcWeightKernel16S(const PtrStep<short> src, const PtrStep<short> src_weight,
+                                              PtrStep<short> dst, PtrStep<short> dst_weight, int rows, int cols)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y < rows && x < cols)
+            {
+                const short3 v = ((const short3*)src.ptr(y))[x];
+                short w = src_weight.ptr(y)[x];
+                ((short3*)dst.ptr(y))[x].x += short((v.x * w) >> 8);
+                ((short3*)dst.ptr(y))[x].y += short((v.y * w) >> 8);
+                ((short3*)dst.ptr(y))[x].z += short((v.z * w) >> 8);
+                dst_weight.ptr(y)[x] += w;
+            }
+        }
+
+        void addSrcWeightGpu16S(const PtrStep<short> src, const PtrStep<short> src_weight,
+                                PtrStep<short> dst, PtrStep<short> dst_weight, cv::Rect &rc)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(rc.width, threads.x), divUp(rc.height, threads.y));
+            addSrcWeightKernel16S<<<grid, threads>>>(src, src_weight, dst, dst_weight, rc.height, rc.width);
+            cudaSafeCall(cudaGetLastError());
+        }
+
+        __global__ void addSrcWeightKernel32F(const PtrStep<short> src, const PtrStepf src_weight,
+                                              PtrStep<short> dst, PtrStepf dst_weight, int rows, int cols)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y < rows && x < cols)
+            {
+                const short3 v = ((const short3*)src.ptr(y))[x];
+                float w = src_weight.ptr(y)[x];
+                ((short3*)dst.ptr(y))[x].x += static_cast<short>(v.x * w);
+                ((short3*)dst.ptr(y))[x].y += static_cast<short>(v.y * w);
+                ((short3*)dst.ptr(y))[x].z += static_cast<short>(v.z * w);
+                dst_weight.ptr(y)[x] += w;
+            }
+        }
+
+        void addSrcWeightGpu32F(const PtrStep<short> src, const PtrStepf src_weight,
+                                PtrStep<short> dst, PtrStepf dst_weight, cv::Rect &rc)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(rc.width, threads.x), divUp(rc.height, threads.y));
+            addSrcWeightKernel32F<<<grid, threads>>>(src, src_weight, dst, dst_weight, rc.height, rc.width);
+            cudaSafeCall(cudaGetLastError());
+        }
+
+        __global__ void normalizeUsingWeightKernel16S(const PtrStep<short> weight, PtrStep<short> src,
+                                                      const int width, const int height)
+        {
+            int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+            if (x < width && y < height)
+            {
+                const short3 v = ((short3*)src.ptr(y))[x];
+                short w = weight.ptr(y)[x];
+                ((short3*)src.ptr(y))[x] = make_short3(short((v.x << 8) / w),
+                                                       short((v.y << 8) / w), short((v.z << 8) / w));
+            }
+        }
+
+        void normalizeUsingWeightMapGpu16S(const PtrStep<short> weight, PtrStep<short> src,
+                                           const int width, const int height)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(width, threads.x), divUp(height, threads.y));
+            normalizeUsingWeightKernel16S<<<grid, threads>>> (weight, src, width, height);
+        }
+
+        __global__ void normalizeUsingWeightKernel32F(const PtrStepf weight, PtrStep<short> src,
+                                                      const int width, const int height)
+        {
+            int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+            if (x < width && y < height)
+            {
+                static const float WEIGHT_EPS = 1e-5f;
+                const short3 v = ((short3*)src.ptr(y))[x];
+                float w = weight.ptr(y)[x];
+                ((short3*)src.ptr(y))[x] = make_short3(static_cast<short>(v.x / (w + WEIGHT_EPS)),
+                                                       static_cast<short>(v.y / (w + WEIGHT_EPS)),
+                                                       static_cast<short>(v.z / (w + WEIGHT_EPS)));
+            }
+        }
+
+        void normalizeUsingWeightMapGpu32F(const PtrStepf weight, PtrStep<short> src,
+                                           const int width, const int height)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(width, threads.x), divUp(height, threads.y));
+            normalizeUsingWeightKernel32F<<<grid, threads>>> (weight, src, width, height);
+        }
+    }
+}}}
+
+#endif
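A note on the arithmetic (a reading of the kernels above, not text from the commit): in the 16S path the weight maps built in feed() lie in [0, 256] (the 0/255 mask converted to CV_16S, plus 1 on covered pixels), so addSrcWeightKernel16S accumulates (value * weight) >> 8 and normalizeUsingWeightKernel16S undoes it with (sum << 8) / weight_sum, i.e. an 8-bit fixed-point weighted average. The 32F path does the same with float weights in [0, 1] and a small WEIGHT_EPS guard against division by zero. A tiny host-side check of the 16S round trip, using hypothetical values:

// Hypothetical host-side illustration of the 16S fixed-point blend/normalize math.
#include <cstdio>

int main()
{
    short v1 = 1200, w1 = 256;              // band value fully covered by image 1 (weight 255 + 1)
    short v2 = 900,  w2 = 64;               // image 2 contributes with a small feathered weight

    short acc = 0, acc_w = 0;
    acc   = short(acc + ((v1 * w1) >> 8));  // as in addSrcWeightKernel16S
    acc_w = short(acc_w + w1);
    acc   = short(acc + ((v2 * w2) >> 8));
    acc_w = short(acc_w + w2);

    short out = short((acc << 8) / acc_w);  // as in normalizeUsingWeightKernel16S
    std::printf("blended = %d, exact weighted average = %.1f\n",
                out, (v1 * 256.0 + v2 * 64.0) / (256.0 + 64.0));  // both give 1140
    return 0;
}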
