
Commit b76e883

improve MultiBandBlender cuda, add normalizeUsingWeight and addSrcWeight kernels

1 parent: 642e4d9
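For context, the commit plugs into the existing cv::detail::MultiBandBlender interface (prepare / feed / blend), with the GPU path selected by the try_gpu constructor argument. The sketch below is illustrative only and not part of the commit; the file names, ROI, corner offsets, and band count are made-up placeholders.

// Hypothetical usage sketch (not from this commit): blend two pre-warped images.
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/stitching/detail/blenders.hpp>

int main()
{
    cv::Mat img1 = cv::imread("warped1.png"), mask1 = cv::imread("mask1.png", cv::IMREAD_GRAYSCALE);
    cv::Mat img2 = cv::imread("warped2.png"), mask2 = cv::imread("mask2.png", cv::IMREAD_GRAYSCALE);

    // try_gpu = true takes the CUDA path added here when OpenCV is built with
    // cudaarithm + cudawarping; otherwise the existing CPU/OpenCL path runs.
    cv::detail::MultiBandBlender blender(/*try_gpu*/ true, /*num_bands*/ 5, /*weight_type*/ CV_32F);

    blender.prepare(cv::Rect(0, 0, 2000, 1500));   // ROI of the final mosaic (placeholder)
    img1.convertTo(img1, CV_16S);                  // feed() works on CV_16SC3 images
    img2.convertTo(img2, CV_16S);
    blender.feed(img1, mask1, cv::Point(0, 0));    // top-left corner of each image within the ROI
    blender.feed(img2, mask2, cv::Point(800, 0));

    cv::Mat result, result_mask;
    blender.blend(result, result_mask);            // result is CV_16SC3; convert before saving
    result.convertTo(result, CV_8U);
    cv::imwrite("pano.png", result);
    return 0;
}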

File tree: 3 files changed, +288 -26 lines


modules/stitching/include/opencv2/stitching/detail/blenders.hpp

Lines changed: 4 additions & 0 deletions
@@ -142,6 +142,10 @@ class CV_EXPORTS MultiBandBlender : public Blender
     Rect dst_roi_final_;
     bool can_use_gpu_;
     int weight_type_; //CV_32F or CV_16S
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    std::vector<cuda::GpuMat> gpu_dst_pyr_laplace_;
+    std::vector<cuda::GpuMat> gpu_dst_band_weights_;
+#endif
 };
 
 
modules/stitching/src/blenders.cpp

Lines changed: 172 additions & 26 deletions
@@ -43,6 +43,23 @@
 #include "precomp.hpp"
 #include "opencl_kernels_stitching.hpp"
 
+#ifdef HAVE_CUDA
+namespace cv { namespace cuda { namespace device
+{
+    namespace blend
+    {
+        void addSrcWeightGpu16S(const PtrStep<short> src, const PtrStep<short> src_weight,
+                                PtrStep<short> dst, PtrStep<short> dst_weight, cv::Rect &rc);
+        void addSrcWeightGpu32F(const PtrStep<short> src, const PtrStepf src_weight,
+                                PtrStep<short> dst, PtrStepf dst_weight, cv::Rect &rc);
+        void normalizeUsingWeightMapGpu16S(const PtrStep<short> weight, PtrStep<short> src,
+                                           const int width, const int height);
+        void normalizeUsingWeightMapGpu32F(const PtrStepf weight, PtrStep<short> src,
+                                           const int width, const int height);
+    }
+}}}
+#endif
+
 namespace cv {
 namespace detail {
 
@@ -228,21 +245,46 @@ void MultiBandBlender::prepare(Rect dst_roi)
 
     Blender::prepare(dst_roi);
 
-    dst_pyr_laplace_.resize(num_bands_ + 1);
-    dst_pyr_laplace_[0] = dst_;
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    if (can_use_gpu_)
+    {
+        gpu_dst_pyr_laplace_.resize(num_bands_ + 1);
+        gpu_dst_pyr_laplace_[0].create(dst_roi.size(), CV_16SC3);
+        gpu_dst_pyr_laplace_[0].setTo(Scalar::all(0));
 
-    dst_band_weights_.resize(num_bands_ + 1);
-    dst_band_weights_[0].create(dst_roi.size(), weight_type_);
-    dst_band_weights_[0].setTo(0);
+        gpu_dst_band_weights_.resize(num_bands_ + 1);
+        gpu_dst_band_weights_[0].create(dst_roi.size(), weight_type_);
+        gpu_dst_band_weights_[0].setTo(0);
 
-    for (int i = 1; i <= num_bands_; ++i)
+        for (int i = 1; i <= num_bands_; ++i)
+        {
+            gpu_dst_pyr_laplace_[i].create((gpu_dst_pyr_laplace_[i - 1].rows + 1) / 2,
+                                           (gpu_dst_pyr_laplace_[i - 1].cols + 1) / 2, CV_16SC3);
+            gpu_dst_band_weights_[i].create((gpu_dst_band_weights_[i - 1].rows + 1) / 2,
+                                            (gpu_dst_band_weights_[i - 1].cols + 1) / 2, weight_type_);
+            gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0));
+            gpu_dst_band_weights_[i].setTo(0);
+        }
+    }
+    else
+#endif
     {
-        dst_pyr_laplace_[i].create((dst_pyr_laplace_[i - 1].rows + 1) / 2,
-                                   (dst_pyr_laplace_[i - 1].cols + 1) / 2, CV_16SC3);
-        dst_band_weights_[i].create((dst_band_weights_[i - 1].rows + 1) / 2,
-                                    (dst_band_weights_[i - 1].cols + 1) / 2, weight_type_);
-        dst_pyr_laplace_[i].setTo(Scalar::all(0));
-        dst_band_weights_[i].setTo(0);
+        dst_pyr_laplace_.resize(num_bands_ + 1);
+        dst_pyr_laplace_[0] = dst_;
+
+        dst_band_weights_.resize(num_bands_ + 1);
+        dst_band_weights_[0].create(dst_roi.size(), weight_type_);
+        dst_band_weights_[0].setTo(0);
+
+        for (int i = 1; i <= num_bands_; ++i)
+        {
+            dst_pyr_laplace_[i].create((dst_pyr_laplace_[i - 1].rows + 1) / 2,
+                                       (dst_pyr_laplace_[i - 1].cols + 1) / 2, CV_16SC3);
+            dst_band_weights_[i].create((dst_band_weights_[i - 1].rows + 1) / 2,
+                                        (dst_band_weights_[i - 1].cols + 1) / 2, weight_type_);
+            dst_pyr_laplace_[i].setTo(Scalar::all(0));
+            dst_band_weights_[i].setTo(0);
+        }
     }
 }
 
@@ -312,6 +354,76 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
     int bottom = br_new.y - tl.y - img.rows;
     int right = br_new.x - tl.x - img.cols;
 
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    if (can_use_gpu_)
+    {
+        // Create the source image Laplacian pyramid
+        cuda::GpuMat gpu_img;
+        gpu_img.upload(img);
+        cuda::GpuMat img_with_border;
+        cuda::copyMakeBorder(gpu_img, img_with_border, top, bottom, left, right, BORDER_REFLECT);
+        std::vector<cuda::GpuMat> gpu_src_pyr_laplace(num_bands_ + 1);
+        img_with_border.convertTo(gpu_src_pyr_laplace[0], CV_16S);
+        for (int i = 0; i < num_bands_; ++i)
+            cuda::pyrDown(gpu_src_pyr_laplace[i], gpu_src_pyr_laplace[i + 1]);
+        for (int i = 0; i < num_bands_; ++i)
+        {
+            cuda::GpuMat up;
+            cuda::pyrUp(gpu_src_pyr_laplace[i + 1], up);
+            cuda::subtract(gpu_src_pyr_laplace[i], up, gpu_src_pyr_laplace[i]);
+        }
+
+        // Create the weight map Gaussian pyramid
+        cuda::GpuMat gpu_mask;
+        gpu_mask.upload(mask);
+        cuda::GpuMat weight_map;
+        std::vector<cuda::GpuMat> gpu_weight_pyr_gauss(num_bands_ + 1);
+
+        if (weight_type_ == CV_32F)
+        {
+            gpu_mask.convertTo(weight_map, CV_32F, 1. / 255.);
+        }
+        else // weight_type_ == CV_16S
+        {
+            gpu_mask.convertTo(weight_map, CV_16S);
+            cuda::GpuMat add_mask;
+            cuda::compare(gpu_mask, 0, add_mask, CMP_NE);
+            cuda::add(weight_map, Scalar::all(1), weight_map, add_mask);
+        }
+        cuda::copyMakeBorder(weight_map, gpu_weight_pyr_gauss[0], top, bottom, left, right, BORDER_CONSTANT);
+        for (int i = 0; i < num_bands_; ++i)
+            cuda::pyrDown(gpu_weight_pyr_gauss[i], gpu_weight_pyr_gauss[i + 1]);
+
+        int y_tl = tl_new.y - dst_roi_.y;
+        int y_br = br_new.y - dst_roi_.y;
+        int x_tl = tl_new.x - dst_roi_.x;
+        int x_br = br_new.x - dst_roi_.x;
+
+        // Add weighted layer of the source image to the final Laplacian pyramid layer
+        for (int i = 0; i <= num_bands_; ++i)
+        {
+            Rect rc(x_tl, y_tl, x_br - x_tl, y_br - y_tl);
+            cuda::GpuMat &_src_pyr_laplace = gpu_src_pyr_laplace[i];
+            cuda::GpuMat _dst_pyr_laplace = gpu_dst_pyr_laplace_[i](rc);
+            cuda::GpuMat &_weight_pyr_gauss = gpu_weight_pyr_gauss[i];
+            cuda::GpuMat _dst_band_weights = gpu_dst_band_weights_[i](rc);
+
+            using namespace cv::cuda::device::blend;
+            if (weight_type_ == CV_32F)
+            {
+                addSrcWeightGpu32F(_src_pyr_laplace, _weight_pyr_gauss, _dst_pyr_laplace, _dst_band_weights, rc);
+            }
+            else
+            {
+                addSrcWeightGpu16S(_src_pyr_laplace, _weight_pyr_gauss, _dst_pyr_laplace, _dst_band_weights, rc);
+            }
+            x_tl /= 2; y_tl /= 2;
+            x_br /= 2; y_br /= 2;
+        }
+        return;
+    }
+#endif
+
     // Create the source image Laplacian pyramid
     UMat img_with_border;
     copyMakeBorder(_img, img_with_border, top, bottom, left, right,
@@ -322,10 +434,7 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 #endif
 
     std::vector<UMat> src_pyr_laplace;
-    if (can_use_gpu_ && img_with_border.depth() == CV_16S)
-        createLaplacePyrGpu(img_with_border, num_bands_, src_pyr_laplace);
-    else
-        createLaplacePyr(img_with_border, num_bands_, src_pyr_laplace);
+    createLaplacePyr(img_with_border, num_bands_, src_pyr_laplace);
 
     LOGLN("  Create the source image Laplacian pyramid, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec");
 #if ENABLE_LOG
@@ -431,20 +540,57 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 
 void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
 {
-    for (int i = 0; i <= num_bands_; ++i)
-        normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]);
-
+    cv::UMat dst_band_weights_0;
+    Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height);
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
     if (can_use_gpu_)
-        restoreImageFromLaplacePyrGpu(dst_pyr_laplace_);
+    {
+        for (int i = 0; i <= num_bands_; ++i)
+        {
+            cuda::GpuMat dst_i = gpu_dst_pyr_laplace_[i];
+            cuda::GpuMat weight_i = gpu_dst_band_weights_[i];
+
+            using namespace ::cv::cuda::device::blend;
+            if (weight_type_ == CV_32F)
+            {
+                normalizeUsingWeightMapGpu32F(weight_i, dst_i, weight_i.cols, weight_i.rows);
+            }
+            else
+            {
+                normalizeUsingWeightMapGpu16S(weight_i, dst_i, weight_i.cols, weight_i.rows);
+            }
+        }
+
+        // Restore image from Laplacian pyramid
+        for (size_t i = num_bands_; i > 0; --i)
+        {
+            cuda::GpuMat up;
+            cuda::pyrUp(gpu_dst_pyr_laplace_[i], up);
+            cuda::add(up, gpu_dst_pyr_laplace_[i - 1], gpu_dst_pyr_laplace_[i - 1]);
+        }
+
+        gpu_dst_pyr_laplace_[0](dst_rc).download(dst_);
+        gpu_dst_band_weights_[0].download(dst_band_weights_0);
+
+        gpu_dst_pyr_laplace_.clear();
+        gpu_dst_band_weights_.clear();
+    }
     else
+#endif
+    {
+        for (int i = 0; i <= num_bands_; ++i)
+            normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]);
+
         restoreImageFromLaplacePyr(dst_pyr_laplace_);
 
-    Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height);
-    dst_ = dst_pyr_laplace_[0](dst_rc);
-    UMat _dst_mask;
-    compare(dst_band_weights_[0](dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
-    dst_pyr_laplace_.clear();
-    dst_band_weights_.clear();
+        dst_ = dst_pyr_laplace_[0](dst_rc);
+        dst_band_weights_0 = dst_band_weights_[0];
+
+        dst_pyr_laplace_.clear();
+        dst_band_weights_.clear();
+    }
+
+    compare(dst_band_weights_0(dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
 
     Blender::blend(dst, dst_mask);
 }
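Both new GPU branches lean on the same pyramid primitives from cudawarping and cudaarithm: feed() builds a per-image Laplacian pyramid (pyrDown each level, then subtract the pyrUp of the next level) plus a Gaussian weight pyramid, and blend() normalizes each band and collapses the accumulated pyramid with pyrUp + add. Each level is (rows + 1) / 2 by (cols + 1) / 2 of the previous one, e.g. a 2000x1500 ROI gives 1000x750, then 500x375 for the first bands. The standalone sketch below only illustrates that build/collapse round trip; it is not code from the commit, and it pads the input itself so every level size matches after pyrUp (the blender handles this through its own copyMakeBorder calls).

// Illustrative sketch only: GPU Laplacian pyramid build and collapse with the
// same cv::cuda calls the new code paths use. "input.png" is a placeholder.
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudawarping.hpp>
#include <vector>

int main()
{
    const int num_bands = 4;
    cv::Mat host = cv::imread("input.png");

    // Pad so width/height are multiples of 2^num_bands; then pyrUp(pyrDown(x))
    // has exactly the size of x at every level and subtract/add sizes match.
    int step = 1 << num_bands;
    cv::Mat padded;
    cv::copyMakeBorder(host, padded, 0, (step - host.rows % step) % step,
                       0, (step - host.cols % step) % step, cv::BORDER_REFLECT);

    std::vector<cv::cuda::GpuMat> pyr(num_bands + 1);
    cv::cuda::GpuMat img;
    img.upload(padded);
    img.convertTo(pyr[0], CV_16S);                    // signed 16-bit holds negative band values

    for (int i = 0; i < num_bands; ++i)               // Gaussian levels
        cv::cuda::pyrDown(pyr[i], pyr[i + 1]);
    for (int i = 0; i < num_bands; ++i)               // turn levels into band-pass (Laplacian) layers
    {
        cv::cuda::GpuMat up;
        cv::cuda::pyrUp(pyr[i + 1], up);
        cv::cuda::subtract(pyr[i], up, pyr[i]);
    }

    for (int i = num_bands; i > 0; --i)               // collapse: inverse of the loop above
    {
        cv::cuda::GpuMat up;
        cv::cuda::pyrUp(pyr[i], up);
        cv::cuda::add(up, pyr[i - 1], pyr[i - 1]);
    }

    cv::Mat restored;
    pyr[0].download(restored);                        // ~= padded input, up to filtering error
    restored.convertTo(restored, CV_8U);
    cv::imwrite("roundtrip.png", restored);
    return 0;
}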
Lines changed: 112 additions & 0 deletions (new file)
@@ -0,0 +1,112 @@
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/types.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace blend
+    {
+        __global__ void addSrcWeightKernel16S(const PtrStep<short> src, const PtrStep<short> src_weight,
+                                              PtrStep<short> dst, PtrStep<short> dst_weight, int rows, int cols)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y < rows && x < cols)
+            {
+                const short3 v = ((const short3*)src.ptr(y))[x];
+                short w = src_weight.ptr(y)[x];
+                ((short3*)dst.ptr(y))[x].x += short((v.x * w) >> 8);
+                ((short3*)dst.ptr(y))[x].y += short((v.y * w) >> 8);
+                ((short3*)dst.ptr(y))[x].z += short((v.z * w) >> 8);
+                dst_weight.ptr(y)[x] += w;
+            }
+        }
+
+        void addSrcWeightGpu16S(const PtrStep<short> src, const PtrStep<short> src_weight,
+                                PtrStep<short> dst, PtrStep<short> dst_weight, cv::Rect &rc)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(rc.width, threads.x), divUp(rc.height, threads.y));
+            addSrcWeightKernel16S<<<grid, threads>>>(src, src_weight, dst, dst_weight, rc.height, rc.width);
+            cudaSafeCall(cudaGetLastError());
+        }
+
+        __global__ void addSrcWeightKernel32F(const PtrStep<short> src, const PtrStepf src_weight,
+                                              PtrStep<short> dst, PtrStepf dst_weight, int rows, int cols)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y < rows && x < cols)
+            {
+                const short3 v = ((const short3*)src.ptr(y))[x];
+                float w = src_weight.ptr(y)[x];
+                ((short3*)dst.ptr(y))[x].x += static_cast<short>(v.x * w);
+                ((short3*)dst.ptr(y))[x].y += static_cast<short>(v.y * w);
+                ((short3*)dst.ptr(y))[x].z += static_cast<short>(v.z * w);
+                dst_weight.ptr(y)[x] += w;
+            }
+        }
+
+        void addSrcWeightGpu32F(const PtrStep<short> src, const PtrStepf src_weight,
+                                PtrStep<short> dst, PtrStepf dst_weight, cv::Rect &rc)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(rc.width, threads.x), divUp(rc.height, threads.y));
+            addSrcWeightKernel32F<<<grid, threads>>>(src, src_weight, dst, dst_weight, rc.height, rc.width);
+            cudaSafeCall(cudaGetLastError());
+        }
+
+        __global__ void normalizeUsingWeightKernel16S(const PtrStep<short> weight, PtrStep<short> src,
+                                                      const int width, const int height)
+        {
+            int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+            if (x < width && y < height)
+            {
+                const short3 v = ((short3*)src.ptr(y))[x];
+                short w = weight.ptr(y)[x];
+                ((short3*)src.ptr(y))[x] = make_short3(short((v.x << 8) / w),
+                                                       short((v.y << 8) / w), short((v.z << 8) / w));
+            }
+        }
+
+        void normalizeUsingWeightMapGpu16S(const PtrStep<short> weight, PtrStep<short> src,
+                                           const int width, const int height)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(width, threads.x), divUp(height, threads.y));
+            normalizeUsingWeightKernel16S<<<grid, threads>>> (weight, src, width, height);
+        }
+
+        __global__ void normalizeUsingWeightKernel32F(const PtrStepf weight, PtrStep<short> src,
+                                                      const int width, const int height)
+        {
+            int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+            if (x < width && y < height)
+            {
+                static const float WEIGHT_EPS = 1e-5f;
+                const short3 v = ((short3*)src.ptr(y))[x];
+                float w = weight.ptr(y)[x];
+                ((short3*)src.ptr(y))[x] = make_short3(static_cast<short>(v.x / (w + WEIGHT_EPS)),
+                                                       static_cast<short>(v.y / (w + WEIGHT_EPS)),
+                                                       static_cast<short>(v.z / (w + WEIGHT_EPS)));
+            }
+        }
+
+        void normalizeUsingWeightMapGpu32F(const PtrStepf weight, PtrStep<short> src,
+                                           const int width, const int height)
+        {
+            dim3 threads(16, 16);
+            dim3 grid(divUp(width, threads.x), divUp(height, threads.y));
+            normalizeUsingWeightKernel32F<<<grid, threads>>> (weight, src, width, height);
+        }
+    }
+}}}
+
+#endif
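A note on the arithmetic (a reading of the kernels above, not text from the commit): in the 16S path the weight maps built in feed() lie in [0, 256] (the 0/255 mask converted to CV_16S, plus 1 on covered pixels), so addSrcWeightKernel16S accumulates (value * weight) >> 8 and normalizeUsingWeightKernel16S undoes it with (sum << 8) / weight_sum, i.e. an 8-bit fixed-point weighted average. The 32F path does the same with float weights in [0, 1] and a small WEIGHT_EPS guard against division by zero. A tiny host-side check of the 16S round trip, using hypothetical values:

// Hypothetical host-side illustration of the 16S fixed-point blend/normalize math.
#include <cstdio>

int main()
{
    short v1 = 1200, w1 = 256;              // band value fully covered by image 1 (weight 255 + 1)
    short v2 = 900,  w2 = 64;               // image 2 contributes with a small feathered weight

    short acc = 0, acc_w = 0;
    acc   = short(acc + ((v1 * w1) >> 8));  // as in addSrcWeightKernel16S
    acc_w = short(acc_w + w1);
    acc   = short(acc + ((v2 * w2) >> 8));
    acc_w = short(acc_w + w2);

    short out = short((acc << 8) / acc_w);  // as in normalizeUsingWeightKernel16S
    std::printf("blended = %d, exact weighted average = %.1f\n",
                out, (v1 * 256.0 + v2 * 64.0) / (256.0 + 64.0));  // both give 1140
    return 0;
}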
