Skip to content

Commit 910d7da

Browse files
committed
prior box layer ocl implementation
Signed-off-by: Li Peng <peng.li@intel.com>
1 parent cac4a7e commit 910d7da

File tree

2 files changed

+254
-0
lines changed

2 files changed

+254
-0
lines changed

modules/dnn/src/layers/prior_box_layer.cpp

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include <float.h>
4646
#include <algorithm>
4747
#include <cmath>
48+
#include "opencl_kernels_dnn.hpp"
4849

4950
namespace cv
5051
{
@@ -270,11 +271,108 @@ class PriorBoxLayerImpl : public PriorBoxLayer
270271
return false;
271272
}
272273

274+
#ifdef HAVE_OPENCL
275+
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
276+
{
277+
std::vector<UMat> inputs;
278+
std::vector<UMat> outputs;
279+
280+
inps.getUMatVector(inputs);
281+
outs.getUMatVector(outputs);
282+
283+
int _layerWidth = inputs[0].size[3];
284+
int _layerHeight = inputs[0].size[2];
285+
286+
int _imageWidth = inputs[1].size[3];
287+
int _imageHeight = inputs[1].size[2];
288+
289+
float stepX, stepY;
290+
if (_stepX == 0 || _stepY == 0)
291+
{
292+
stepX = static_cast<float>(_imageWidth) / _layerWidth;
293+
stepY = static_cast<float>(_imageHeight) / _layerHeight;
294+
} else {
295+
stepX = _stepX;
296+
stepY = _stepY;
297+
}
298+
299+
if (umat_offsetsX.empty())
300+
{
301+
Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
302+
Mat offsetsY(1, _offsetsX.size(), CV_32FC1, &_offsetsY[0]);
303+
Mat aspectRatios(1, _aspectRatios.size(), CV_32FC1, &_aspectRatios[0]);
304+
Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]);
305+
306+
offsetsX.copyTo(umat_offsetsX);
307+
offsetsY.copyTo(umat_offsetsY);
308+
aspectRatios.copyTo(umat_aspectRatios);
309+
variance.copyTo(umat_variance);
310+
311+
int real_numPriors = _numPriors / pow(2, _offsetsX.size() - 1);
312+
umat_scales = UMat(1, &real_numPriors, CV_32F, 1.0f);
313+
}
314+
315+
size_t nthreads = _layerHeight * _layerWidth;
316+
317+
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
318+
kernel.set(0, (int)nthreads);
319+
kernel.set(1, (float)stepX);
320+
kernel.set(2, (float)stepY);
321+
kernel.set(3, (float)_minSize);
322+
kernel.set(4, (float)_maxSize);
323+
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
324+
kernel.set(6, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
325+
kernel.set(7, (int)_offsetsX.size());
326+
kernel.set(8, ocl::KernelArg::PtrReadOnly(umat_aspectRatios));
327+
kernel.set(9, (int)_aspectRatios.size());
328+
kernel.set(10, ocl::KernelArg::PtrReadOnly(umat_scales));
329+
kernel.set(11, ocl::KernelArg::PtrWriteOnly(outputs[0]));
330+
kernel.set(12, (int)_layerHeight);
331+
kernel.set(13, (int)_layerWidth);
332+
kernel.set(14, (int)_imageHeight);
333+
kernel.set(15, (int)_imageWidth);
334+
kernel.run(1, &nthreads, NULL, false);
335+
336+
// clip the prior's coordidate such that it is within [0, 1]
337+
if (_clip)
338+
{
339+
Mat mat = outputs[0].getMat(ACCESS_READ);
340+
int aspect_count = (_maxSize > 0) ? 1 : 0;
341+
int offset = nthreads * 4 * _offsetsX.size() * (1 + aspect_count + _aspectRatios.size());
342+
float* outputPtr = mat.ptr<float>() + offset;
343+
int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
344+
for (size_t d = 0; d < _outChannelSize; ++d)
345+
{
346+
outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
347+
}
348+
}
349+
350+
// set the variance.
351+
{
352+
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc);
353+
int offset = total(shape(outputs[0]), 2);
354+
size_t nthreads = _layerHeight * _layerWidth * _numPriors;
355+
kernel.set(0, (int)nthreads);
356+
kernel.set(1, (int)offset);
357+
kernel.set(2, (int)_variance.size());
358+
kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_variance));
359+
kernel.set(4, ocl::KernelArg::PtrWriteOnly(outputs[0]));
360+
if (!kernel.run(1, &nthreads, NULL, false))
361+
return false;
362+
}
363+
return true;
364+
}
365+
#endif
366+
273367
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
274368
{
275369
CV_TRACE_FUNCTION();
276370
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
277371

372+
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
373+
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
374+
forward_ocl(inputs_arr, outputs_arr, internals_arr))
375+
278376
Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
279377
}
280378

@@ -441,6 +539,14 @@ class PriorBoxLayerImpl : public PriorBoxLayer
441539
std::vector<float> _offsetsX;
442540
std::vector<float> _offsetsY;
443541

542+
#ifdef HAVE_OPENCL
543+
UMat umat_offsetsX;
544+
UMat umat_offsetsY;
545+
UMat umat_aspectRatios;
546+
UMat umat_scales;
547+
UMat umat_variance;
548+
#endif
549+
444550
bool _flip;
445551
bool _clip;
446552
bool _explicitSizes;

modules/dnn/src/opencl/prior_box.cl

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/*M///////////////////////////////////////////////////////////////////////////////////////
2+
//
3+
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4+
//
5+
// By downloading, copying, installing or using the software you agree to this license.
6+
// If you do not agree to this license, do not download, install,
7+
// copy or use the software.
8+
//
9+
//
10+
// License Agreement
11+
// For Open Source Computer Vision Library
12+
//
13+
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
14+
// Third party copyrights are property of their respective owners.
15+
//
16+
// Redistribution and use in source and binary forms, with or without modification,
17+
// are permitted provided that the following conditions are met:
18+
//
19+
// * Redistribution's of source code must retain the above copyright notice,
20+
// this list of conditions and the following disclaimer.
21+
//
22+
// * Redistribution's in binary form must reproduce the above copyright notice,
23+
// this list of conditions and the following disclaimer in the documentation
24+
// and/or other materials provided with the distribution.
25+
//
26+
// * The name of the copyright holders may not be used to endorse or promote products
27+
// derived from this software without specific prior written permission.
28+
//
29+
// This software is provided by the copyright holders and contributors "as is" and
30+
// any express or implied warranties, including, but not limited to, the implied
31+
// warranties of merchantability and fitness for a particular purpose are disclaimed.
32+
// In no event shall the Intel Corporation or contributors be liable for any direct,
33+
// indirect, incidental, special, exemplary, or consequential damages
34+
// (including, but not limited to, procurement of substitute goods or services;
35+
// loss of use, data, or profits; or business interruption) however caused
36+
// and on any theory of liability, whether in contract, strict liability,
37+
// or tort (including negligence or otherwise) arising in any way out of
38+
// the use of this software, even if advised of the possibility of such damage.
39+
//
40+
//M*/
41+
42+
#define Dtype float
43+
#define Dtype4 float4
44+
45+
__kernel void prior_box(const int nthreads,
46+
const Dtype stepX,
47+
const Dtype stepY,
48+
const Dtype _minSize,
49+
const Dtype _maxSize,
50+
__global const Dtype* _offsetsX,
51+
__global const Dtype* _offsetsY,
52+
const int offsetsX_size,
53+
__global const Dtype* _aspectRatios,
54+
const int aspectRatios_size,
55+
__global const Dtype* scales,
56+
__global Dtype* dst,
57+
const int _layerHeight,
58+
const int _layerWidth,
59+
const int imgHeight,
60+
const int imgWidth)
61+
{
62+
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
63+
{
64+
int w = index % _layerWidth;
65+
int h = index / _layerWidth;
66+
__global Dtype* outputPtr;
67+
int aspect_count = (_maxSize > 0) ? 1 : 0;
68+
outputPtr = dst + index * 4 * offsetsX_size * (1 + aspect_count + aspectRatios_size);
69+
70+
Dtype _boxWidth, _boxHeight;
71+
Dtype4 vec;
72+
_boxWidth = _boxHeight = _minSize * scales[0];
73+
for (int i = 0; i < offsetsX_size; ++i)
74+
{
75+
float center_x = (w + _offsetsX[i]) * stepX;
76+
float center_y = (h + _offsetsY[i]) * stepY;
77+
78+
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
79+
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
80+
vec.z = (center_x + _boxWidth * 0.5f) / imgWidth; // xmax
81+
vec.w = (center_y + _boxHeight * 0.5f) / imgHeight; // ymax
82+
vstore4(vec, 0, outputPtr);
83+
84+
outputPtr += 4;
85+
}
86+
87+
if (_maxSize > 0)
88+
{
89+
_boxWidth = _boxHeight = native_sqrt(_minSize * _maxSize) * scales[1];
90+
91+
for (int i = 0; i < offsetsX_size; ++i)
92+
{
93+
float center_x = (w + _offsetsX[i]) * stepX;
94+
float center_y = (h + _offsetsY[i]) * stepY;
95+
96+
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
97+
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
98+
vec.z = (center_x + _boxWidth * 0.5f) / imgWidth; // xmax
99+
vec.w = (center_y + _boxHeight * 0.5f) / imgHeight; // ymax
100+
vstore4(vec, 0, outputPtr);
101+
102+
outputPtr += 4;
103+
}
104+
}
105+
106+
for (int r = 0; r < aspectRatios_size; ++r)
107+
{
108+
float ar = native_sqrt(_aspectRatios[r]);
109+
float scale = scales[(_maxSize > 0 ? 2 : 1) + r];
110+
111+
_boxWidth = _minSize * ar * scale;
112+
_boxHeight = _minSize / ar * scale;
113+
114+
for (int i = 0; i < offsetsX_size; ++i)
115+
{
116+
float center_x = (w + _offsetsX[i]) * stepX;
117+
float center_y = (h + _offsetsY[i]) * stepY;
118+
119+
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
120+
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
121+
vec.z = (center_x + _boxWidth * 0.5f) / imgWidth; // xmax
122+
vec.w = (center_y + _boxHeight * 0.5f) / imgHeight; // ymax
123+
vstore4(vec, 0, outputPtr);
124+
125+
outputPtr += 4;
126+
}
127+
}
128+
}
129+
}
130+
131+
__kernel void set_variance(const int nthreads,
132+
const int offset,
133+
const int variance_size,
134+
__global const Dtype* variance,
135+
__global Dtype* dst)
136+
{
137+
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
138+
{
139+
Dtype4 var_vec;
140+
141+
if (variance_size == 1)
142+
var_vec = (Dtype4)(variance[0]);
143+
else
144+
var_vec = vload4(0, variance);
145+
146+
vstore4(var_vec, 0, dst + offset + index * 4);
147+
}
148+
}

0 commit comments

Comments
 (0)