|
44 | 44 | #include "layers_common.hpp"
|
45 | 45 | #include <float.h>
|
46 | 46 | #include <algorithm>
|
| 47 | +#include "opencl_kernels_dnn.hpp" |
47 | 48 |
|
48 | 49 | namespace cv
|
49 | 50 | {
|
@@ -173,6 +174,24 @@ class PermuteLayerImpl : public PermuteLayer
|
173 | 174 | CV_Assert((int)_numAxes == inp0.dims);
|
174 | 175 |
|
175 | 176 | computeStrides(shape(*inputs[0]), shape(outputs[0]));
|
| 177 | + |
| 178 | +#ifdef HAVE_OPENCL |
| 179 | + if (uorder.empty()) |
| 180 | + { |
| 181 | + std::vector<int> orderVec(_order.begin(), _order.end());; |
| 182 | + Mat morder(1, orderVec.size(), CV_32SC1, &orderVec[0]); |
| 183 | + |
| 184 | + std::vector<int> oldStrideVec(_oldStride.begin(), _oldStride.end()); |
| 185 | + Mat mold_stride(1, _oldStride.size(), CV_32SC1, &oldStrideVec[0]); |
| 186 | + |
| 187 | + std::vector<int> newStrideVec(_newStride.begin(), _newStride.end()); |
| 188 | + Mat mnew_stride(1, newStrideVec.size(), CV_32SC1, &newStrideVec[0]); |
| 189 | + |
| 190 | + morder.copyTo(uorder); |
| 191 | + mold_stride.copyTo(uold_stride); |
| 192 | + mnew_stride.copyTo(unew_stride); |
| 193 | + } |
| 194 | +#endif |
176 | 195 | }
|
177 | 196 |
|
178 | 197 | class PermuteInvoker : public ParallelLoopBody
|
@@ -247,11 +266,47 @@ class PermuteLayerImpl : public PermuteLayer
|
247 | 266 | }
|
248 | 267 | };
|
249 | 268 |
|
| 269 | +#ifdef HAVE_OPENCL |
| 270 | + bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) |
| 271 | + { |
| 272 | + std::vector<UMat> inputs; |
| 273 | + std::vector<UMat> outputs; |
| 274 | + |
| 275 | + inps.getUMatVector(inputs); |
| 276 | + outs.getUMatVector(outputs); |
| 277 | + |
| 278 | + if (!_needsPermute) |
| 279 | + return false; |
| 280 | + |
| 281 | + for (size_t i = 0; i < inputs.size(); i++) |
| 282 | + { |
| 283 | + ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc); |
| 284 | + |
| 285 | + kernel.set(0, (int)_count); |
| 286 | + kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i])); |
| 287 | + kernel.set(2, ocl::KernelArg::PtrReadOnly(uorder)); |
| 288 | + kernel.set(3, ocl::KernelArg::PtrReadOnly(uold_stride)); |
| 289 | + kernel.set(4, ocl::KernelArg::PtrReadOnly(unew_stride)); |
| 290 | + kernel.set(5, (int)_numAxes); |
| 291 | + kernel.set(6, ocl::KernelArg::PtrWriteOnly(outputs[i])); |
| 292 | + |
| 293 | + if (!kernel.run(1, &_count, NULL, false)) |
| 294 | + return false; |
| 295 | + } |
| 296 | + |
| 297 | + return true; |
| 298 | + } |
| 299 | +#endif |
| 300 | + |
// Layer entry point: emits trace information for this layer, then hands the
// three array arguments to forward_ocl() through CV_OCL_RUN when
// preferableTarget is DNN_TARGET_OPENCL and the OCL_PERFORMANCE_CHECK on the
// default device being Intel passes, and finally calls Layer::forward_fallback().
// NOTE(review): CV_OCL_RUN is defined outside this view; presumably it returns
// early when forward_ocl() succeeds so the fallback is skipped -- confirm
// against the macro definition.
250 | 301 | void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
|
251 | 302 | {
|
252 | 303 | CV_TRACE_FUNCTION();
|
253 | 304 | CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
254 | 305 |
|
| 306 | + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && |
| 307 | + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), |
| 308 | + forward_ocl(inputs_arr, outputs_arr, internals_arr)) |
| 309 | + |
255 | 310 | Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
|
256 | 311 | }
|
257 | 312 |
|
@@ -325,6 +380,10 @@ class PermuteLayerImpl : public PermuteLayer
|
325 | 380 | std::vector<size_t> _newStride;
|
326 | 381 | bool _needsPermute;
|
327 | 382 |
|
| 383 | +#ifdef HAVE_OPENCL |
| 384 | + UMat uorder, uold_stride, unew_stride; |
| 385 | +#endif |
| 386 | + |
328 | 387 | size_t _numAxes;
|
329 | 388 | };
|
330 | 389 |
|
|
0 commit comments