diff --git a/src/backend/common/KernelInterface.hpp b/src/backend/common/KernelInterface.hpp index 0ead60a8cd..252f04f37f 100644 --- a/src/backend/common/KernelInterface.hpp +++ b/src/backend/common/KernelInterface.hpp @@ -64,6 +64,51 @@ class KernelInterface { virtual void copyToReadOnly(DevPtrType dst, DevPtrType src, size_t bytes) = 0; + /// \brief Copy data from device memory to read-only memory + /// + /// This function copies data of `bytes` size from the device pointer to a + /// read-only memory. + /// + /// \param[in] dst is the device pointer to which data will be copied + /// \param[in] src is the device pointer from which data will be copied + /// \param[in] srcXInBytes is offset in Bytes + /// \param[in] bytes are the number of bytes of data to be copied + virtual void copyToReadOnly(DevPtrType dst, DevPtrType src, + size_t srcXInBytes, size_t bytes) = 0; + + /// \brief Copy strided 2D data from device memory to read-only memory + /// + /// This function copies data of any 2D array from the device pointer to a + /// read-only memory. + /// + /// \param[in] dst is the device pointer to which data will be copied + /// \param[in] src is the device pointer from which data will be copied + /// \param[in] srcXInBytes is offset in Bytes + /// \param[in] srcPitchInBytes is strides[1] in Bytes + /// \param[in] height is the number of elements for dim[1] dst + /// \param[in] widthInBytes are #bytes of continous data to copy (dim[0]) + virtual void copyToReadOnly2D(DevPtrType dst, DevPtrType src, + size_t srcXInBytes, size_t srcPitchInBytes, + size_t height, size_t widthInBytes) = 0; + + /// \brief Copy strided 3D data from device memory to read-only memory + /// + /// This function copies data of any 3D array from the device pointer to a + /// read-only memory. + /// + /// \param[in] dst is the device pointer to which data will be copied + /// \param[in] src is the device pointer from which data will be copied + /// \param[in] srcXInBytes is offset in Bytes + /// \param[in] srcPitchInBytes is strides[1] in Bytes + /// \param[in] srcHeight is the number of elements ALLOCATED for dim[1] src + /// \param[in] depth is the number of elements for dim[2] dst + /// \param[in] height is the number of elements for dim[1] dst + /// \param[in] widthInBytes are #bytes of continous data to copy (dim[0]) + virtual void copyToReadOnly3D(DevPtrType dst, DevPtrType src, + size_t srcXInBytes, size_t srcPitchInBytes, + size_t srcHeight, size_t depth, size_t height, + size_t widthInBytes) = 0; + /// \brief Copy a single scalar to device memory /// /// This function copies a single value of type T from host variable diff --git a/src/backend/cpu/kernel/convolve.hpp b/src/backend/cpu/kernel/convolve.hpp index 62381dd749..5bb5478931 100644 --- a/src/backend/cpu/kernel/convolve.hpp +++ b/src/backend/cpu/kernel/convolve.hpp @@ -125,8 +125,8 @@ void one2one_3d(InT *optr, InT const *const iptr, AccT const *const fptr, } optr[koff + joff + i - iStart] = InT(accum); } // i loop ends here - } // j loop ends here - } // k loop ends here + } // j loop ends here + } // k loop ends here } template @@ -217,7 +217,6 @@ void convolve2_separable(InT *optr, InT const *const iptr, dim_t fDim, af::dim4 const &oStrides, af::dim4 const &sStrides, dim_t fStride) { UNUSED(orgDims); - UNUSED(sStrides); UNUSED(fStride); for (dim_t j = 0; j < oDims[1]; ++j) { dim_t jOff = j * oStrides[1]; @@ -237,14 +236,18 @@ void convolve2_separable(InT *optr, InT const *const iptr, dim_t offi = ci - f; bool isCIValid = offi >= 0 && offi < sDims[0]; bool isCJValid = cj >= 0 && cj < sDims[1]; - s_val = (isCJValid && isCIValid ? iptr[cj * sDims[0] + offi] - : scalar(0)); + s_val = + (isCJValid && isCIValid ? iptr[cj * sStrides.dims[1] + + offi * sStrides.dims[0]] + : scalar(0)); } else { dim_t offj = cj - f; bool isCIValid = ci >= 0 && ci < sDims[0]; bool isCJValid = offj >= 0 && offj < sDims[1]; - s_val = (isCJValid && isCIValid ? iptr[offj * sDims[0] + ci] - : scalar(0)); + s_val = + (isCJValid && isCIValid ? iptr[offj * sStrides.dims[1] + + ci * sStrides.dims[0]] + : scalar(0)); } accum += AccT(s_val * f_val); diff --git a/src/backend/cpu/kernel/fast.hpp b/src/backend/cpu/kernel/fast.hpp index 341ddbe701..7927dba060 100644 --- a/src/backend/cpu/kernel/fast.hpp +++ b/src/backend/cpu/kernel/fast.hpp @@ -28,7 +28,9 @@ inline int idx_x(int i) { return idx_y(i - 12); } -inline int idx(int y, int x, unsigned idim0) { return x * idim0 + y; } +inline int idx(int y, int x, unsigned istrides0, unsigned istrides1) { + return x * istrides1 + y * istrides0; +} // test_greater() // Tests if a pixel x > p + thr @@ -44,9 +46,10 @@ inline int test_smaller(float x, float p, float thr) { return (x < p - thr); } // Returns 1 when x > p + thr template inline int test_pixel(const T *image, const float p, float thr, int y, int x, - unsigned idim0) { - return -test_smaller((float)image[idx(y, x, idim0)], p, thr) + - test_greater((float)image[idx(y, x, idim0)], p, thr); + unsigned istrides0, unsigned istrides1) { + return -test_smaller((float)image[idx(y, x, istrides0, istrides1)], p, + thr) + + test_greater((float)image[idx(y, x, istrides0, istrides1)], p, thr); } // abs_diff() @@ -64,36 +67,53 @@ void locate_features(CParam in, Param score, Param x_out, unsigned *count, float const thr, unsigned const arc_length, unsigned const nonmax, unsigned const max_feat, unsigned const edge) { - af::dim4 in_dims = in.dims(); - T const *in_ptr = in.get(); + af::dim4 in_dims = in.dims(); + af::dim4 in_strides = in.strides(); + T const *in_ptr = in.get(); for (int y = edge; y < (int)(in_dims[0] - edge); y++) { for (int x = edge; x < (int)(in_dims[1] - edge); x++) { - float p = in_ptr[idx(y, x, in_dims[0])]; + float p = in_ptr[idx(y, x, in_strides[0], in_strides[1])]; // Start by testing opposite pixels of the circle that will result // in a non-kepoint int d; - d = test_pixel(in_ptr, p, thr, y - 3, x, in_dims[0]) | - test_pixel(in_ptr, p, thr, y + 3, x, in_dims[0]); + d = test_pixel(in_ptr, p, thr, y - 3, x, in_strides[0], + in_strides[1]) | + test_pixel(in_ptr, p, thr, y + 3, x, in_strides[0], + in_strides[1]); if (d == 0) continue; - d &= test_pixel(in_ptr, p, thr, y - 2, x + 2, in_dims[0]) | - test_pixel(in_ptr, p, thr, y + 2, x - 2, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y, x + 3, in_dims[0]) | - test_pixel(in_ptr, p, thr, y, x - 3, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y + 2, x + 2, in_dims[0]) | - test_pixel(in_ptr, p, thr, y - 2, x - 2, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y - 2, x + 2, in_strides[0], + in_strides[1]) | + test_pixel(in_ptr, p, thr, y + 2, x - 2, in_strides[0], + in_strides[1]); + d &= test_pixel(in_ptr, p, thr, y, x + 3, in_strides[0], + in_strides[1]) | + test_pixel(in_ptr, p, thr, y, x - 3, in_strides[0], + in_strides[1]); + d &= test_pixel(in_ptr, p, thr, y + 2, x + 2, in_strides[0], + in_strides[1]) | + test_pixel(in_ptr, p, thr, y - 2, x - 2, in_strides[0], + in_strides[1]); if (d == 0) continue; - d &= test_pixel(in_ptr, p, thr, y - 3, x + 1, in_dims[0]) | - test_pixel(in_ptr, p, thr, y + 3, x - 1, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y - 1, x + 3, in_dims[0]) | - test_pixel(in_ptr, p, thr, y + 1, x - 3, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y + 1, x + 3, in_dims[0]) | - test_pixel(in_ptr, p, thr, y - 1, x - 3, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y + 3, x + 1, in_dims[0]) | - test_pixel(in_ptr, p, thr, y - 3, x - 1, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y - 3, x + 1, in_strides[0], + in_strides[1]) | + test_pixel(in_ptr, p, thr, y + 3, x - 1, in_strides[0], + in_strides[1]); + d &= test_pixel(in_ptr, p, thr, y - 1, x + 3, in_strides[0], + in_strides[1]) | + test_pixel(in_ptr, p, thr, y + 1, x - 3, in_strides[0], + in_strides[1]); + d &= test_pixel(in_ptr, p, thr, y + 1, x + 3, in_strides[0], + in_strides[1]) | + test_pixel(in_ptr, p, thr, y - 1, x - 3, in_strides[0], + in_strides[1]); + d &= test_pixel(in_ptr, p, thr, y + 3, x + 1, in_strides[0], + in_strides[1]) | + test_pixel(in_ptr, p, thr, y - 3, x - 1, in_strides[0], + in_strides[1]); if (d == 0) continue; int sum = 0; @@ -101,7 +121,7 @@ void locate_features(CParam in, Param score, Param x_out, // Sum responses [-1, 0 or 1] of first arc_length pixels for (int i = 0; i < static_cast(arc_length); i++) sum += test_pixel(in_ptr, p, thr, y + idx_y(i), x + idx_x(i), - in_dims[0]); + in_strides[0], in_strides[1]); // Test maximum and mininmum responses of first segment of // arc_length pixels @@ -113,9 +133,10 @@ void locate_features(CParam in, Param score, Param x_out, // circle for (int i = arc_length; i < 16; i++) { sum -= test_pixel(in_ptr, p, thr, y + idx_y(i - arc_length), - x + idx_x(i - arc_length), in_dims[0]); + x + idx_x(i - arc_length), in_strides[0], + in_strides[1]); sum += test_pixel(in_ptr, p, thr, y + idx_y(i), x + idx_x(i), - in_dims[0]); + in_strides[0], in_strides[1]); max_sum = std::max(max_sum, sum); min_sum = std::min(min_sum, sum); } @@ -123,19 +144,20 @@ void locate_features(CParam in, Param score, Param x_out, // To completely test all possible segments, it's necessary to test // segments that include the top junction of the circle for (int i = 0; i < static_cast(arc_length - 1); i++) { - sum -= test_pixel( - in_ptr, p, thr, y + idx_y(16 - arc_length + i), - x + idx_x(16 - arc_length + i), in_dims[0]); + sum -= test_pixel(in_ptr, p, thr, + y + idx_y(16 - arc_length + i), + x + idx_x(16 - arc_length + i), + in_strides[0], in_strides[1]); sum += test_pixel(in_ptr, p, thr, y + idx_y(i), x + idx_x(i), - in_dims[0]); + in_strides[0], in_strides[1]); max_sum = std::max(max_sum, sum); min_sum = std::min(min_sum, sum); } float s_bright = 0, s_dark = 0; for (int i = 0; i < 16; i++) { - float p_x = - (float)in_ptr[idx(y + idx_y(i), x + idx_x(i), in_dims[0])]; + float p_x = (float)in_ptr[idx(y + idx_y(i), x + idx_x(i), + in_strides[0], in_strides[1])]; s_bright += test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr); @@ -159,7 +181,7 @@ void locate_features(CParam in, Param score, Param x_out, static_cast(std::max(s_bright, s_dark)); if (nonmax == 1) { float *score_ptr = score.get(); - score_ptr[idx(y, x, in_dims[0])] = + score_ptr[idx(y, x, 1, in_dims[0])] = std::max(s_bright, s_dark); } } diff --git a/src/backend/cpu/kernel/orb.hpp b/src/backend/cpu/kernel/orb.hpp index 385f71abb6..708caba7f3 100644 --- a/src/backend/cpu/kernel/orb.hpp +++ b/src/backend/cpu/kernel/orb.hpp @@ -119,8 +119,9 @@ void harris_response(float* x_out, float* y_out, float* score_out, unsigned* usable_feat, CParam image, const unsigned block_size, const float k_thr, const unsigned patch_size) { - const af::dim4 idims = image.dims(); - const T* image_ptr = image.get(); + const af::dim4 idims = image.dims(); + const af::dim4 istrides = image.strides(); + const T* image_ptr = image.get(); for (unsigned f = 0; f < total_feat; f++) { unsigned x, y; float scl = 1.f; @@ -154,10 +155,12 @@ void harris_response(float* x_out, float* y_out, float* score_out, int j = k % block_size - r; // Calculate local x and y derivatives - float ix = image_ptr[(x + i + 1) * idims[0] + y + j] - - image_ptr[(x + i - 1) * idims[0] + y + j]; - float iy = image_ptr[(x + i) * idims[0] + y + j + 1] - - image_ptr[(x + i) * idims[0] + y + j - 1]; + float ix = + image_ptr[(x + i + 1) * istrides[1] + (y + j) * istrides[0]] - + image_ptr[(x + i - 1) * istrides[1] + (y + j) * istrides[0]]; + float iy = + image_ptr[(x + i) * istrides[1] + (y + j + 1) * istrides[0]] - + image_ptr[(x + i) * istrides[1] + (y + j - 1) * istrides[0]]; // Accumulate second order derivatives ixx += ix * ix; @@ -189,8 +192,9 @@ template void centroid_angle(const float* x_in, const float* y_in, float* orientation_out, const unsigned total_feat, CParam image, const unsigned patch_size) { - const af::dim4 idims = image.dims(); - const T* image_ptr = image.get(); + const af::dim4 idims = image.dims(); + const af::dim4 istrides = image.strides(); + const T* image_ptr = image.get(); for (unsigned f = 0; f < total_feat; f++) { unsigned x = (unsigned)round(x_in[f]); unsigned y = (unsigned)round(y_in[f]); @@ -205,7 +209,7 @@ void centroid_angle(const float* x_in, const float* y_in, int j = k % patch_size - r; // Calculate first order moments - T p = image_ptr[(x + i) * idims[0] + y + j]; + T p = image_ptr[(x + i) * istrides[1] + (y + j) * istrides[0]]; m01 += j * p; m10 += i * p; } @@ -219,17 +223,17 @@ template inline T get_pixel(unsigned x, unsigned y, const float ori, const unsigned size, const int dist_x, const int dist_y, CParam image, const unsigned patch_size) { - const af::dim4 idims = image.dims(); - const T* image_ptr = image.get(); - float ori_sin = sin(ori); - float ori_cos = cos(ori); - float patch_scl = (float)size / (float)patch_size; + const af::dim4 istrides = image.strides(); + const T* image_ptr = image.get(); + float ori_sin = sin(ori); + float ori_cos = cos(ori); + float patch_scl = (float)size / (float)patch_size; // Calculate point coordinates based on orientation and size x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin); y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos); - return image_ptr[x * idims[0] + y]; + return image_ptr[x * istrides[1] + y * istrides[0]]; } template diff --git a/src/backend/cuda/Kernel.cpp b/src/backend/cuda/Kernel.cpp index d72672a1fc..25968fb646 100644 --- a/src/backend/cuda/Kernel.cpp +++ b/src/backend/cuda/Kernel.cpp @@ -26,6 +26,67 @@ void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src, CU_CHECK(cuMemcpyDtoDAsync(dst, src, bytes, getActiveStream())); } +void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src, + size_t srcXInBytes, size_t bytes) { + CU_CHECK(cuMemcpyDtoDAsync(dst, src, bytes, getActiveStream())); +} + +void Kernel::copyToReadOnly2D(Kernel::DevPtrType dst, Kernel::DevPtrType src, + size_t srcXInBytes, size_t srcPitchInBytes, + size_t height, size_t widthInBytes) { + CUDA_MEMCPY2D pCopy; + pCopy.srcXInBytes = srcXInBytes; + pCopy.srcY = 0; + pCopy.srcMemoryType = CU_MEMORYTYPE_DEVICE; + pCopy.srcDevice = src; + pCopy.srcPitch = srcPitchInBytes; + + pCopy.dstXInBytes = 0; + pCopy.dstY = 0; + pCopy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + pCopy.dstDevice = dst; + pCopy.dstPitch = widthInBytes; + + pCopy.WidthInBytes = widthInBytes; + pCopy.Height = height; + // CUdeviceptr srcStart = srcDevice + srcY*srcPitch + srcXInBytes; + // CUdeviceptr dstStart = dstDevice + dstY*dstPitch + dstXInBytes; + + CU_CHECK(cuMemcpy2DAsync(&pCopy, getActiveStream())); +} + +void Kernel::copyToReadOnly3D(Kernel::DevPtrType dst, Kernel::DevPtrType src, + size_t srcXInBytes, size_t srcPitchInBytes, + size_t srcHeight, size_t depth, size_t height, + size_t widthInBytes) { + CUDA_MEMCPY3D pCopy; + pCopy.srcXInBytes = srcXInBytes; + pCopy.srcY = 0; + pCopy.srcZ = 0; + pCopy.srcLOD = 0; + pCopy.srcMemoryType = CU_MEMORYTYPE_DEVICE; + pCopy.srcDevice = src; + pCopy.srcPitch = srcPitchInBytes; + pCopy.srcHeight = srcHeight; + + pCopy.dstXInBytes = 0; + pCopy.dstY = 0; + pCopy.dstZ = 0; + pCopy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + pCopy.dstDevice = dst; + pCopy.dstPitch = widthInBytes; + pCopy.dstHeight = height; + + pCopy.WidthInBytes = widthInBytes; + pCopy.Height = height; + pCopy.Depth = depth; + // CUdeviceptr srcStart = + // srcDevice + (srcZ*srcHeight+srcY)*srcPitch + srcXInBytes; + // CUdeviceptr dstStart = + // dstDevice + (dstZ*dstHeight+dstY)*dstPitch + dstXInBytes; + CU_CHECK(cuMemcpy3DAsync(&pCopy, getActiveStream())); +} + void Kernel::setFlag(Kernel::DevPtrType dst, int* scalarValPtr, const bool syncCopy) { CU_CHECK( diff --git a/src/backend/cuda/Kernel.hpp b/src/backend/cuda/Kernel.hpp index 2199292080..2d5ff6eb39 100644 --- a/src/backend/cuda/Kernel.hpp +++ b/src/backend/cuda/Kernel.hpp @@ -66,6 +66,18 @@ class Kernel void copyToReadOnly(DevPtrType dst, DevPtrType src, size_t bytes) final; + void copyToReadOnly(DevPtrType dst, DevPtrType src, size_t srcXInBytes, + size_t bytes) final; + + void copyToReadOnly2D(DevPtrType dst, DevPtrType src, size_t srcXInBytes, + size_t srcPitchInBytes, size_t height, + size_t widthInBytes) final; + + void copyToReadOnly3D(DevPtrType dst, DevPtrType src, size_t srcXInBytes, + size_t srcPitchInBytes, size_t srcHeight, + size_t depth, size_t height, + size_t widthInBytes) final; + void setFlag(DevPtrType dst, int* scalarValPtr, const bool syncCopy = false) final; diff --git a/src/backend/cuda/kernel/convolve.hpp b/src/backend/cuda/kernel/convolve.hpp index 38339f2de2..0af7ade897 100644 --- a/src/backend/cuda/kernel/convolve.hpp +++ b/src/backend/cuda/kernel/convolve.hpp @@ -120,7 +120,6 @@ void convolve_1d(conv_kparam_t& p, Param out, CParam sig, CParam filt, int f1Off = b1 * filt.strides[1]; const aT* fptr = filt.ptr + (f1Off + f2Off + f3Off); - // FIXME: case where filter array is strided auto constMemPtr = convolve1.getDevPtr(conv_c_name); convolve1.copyToReadOnly(constMemPtr, reinterpret_cast(fptr), @@ -145,28 +144,40 @@ void convolve_1d(conv_kparam_t& p, Param out, CParam sig, CParam filt, template void conv2Helper(const conv_kparam_t& p, Param out, CParam sig, - const aT* fptr, int f0, int f1, const bool expand) { - const bool isFilterSizeLt5 = (f0 <= 5 && f1 <= 5); - const bool isFilterGt5AndSq = (f0 == f1 && f0 > 5 && f0 < 18); + const aT* fptr, const dim_t* const fdims, + const dim_t* const fstrides, const bool expand) { + const bool isFilterSizeLt5 = (fdims[0] <= 5 && fdims[1] <= 5); + const bool isFilterGt5AndSq = + (fdims[0] == fdims[1] && fdims[0] > 5 && fdims[0] < 18); if (!(isFilterSizeLt5 || isFilterGt5AndSq)) { char errMessage[256]; snprintf(errMessage, sizeof(errMessage), - "\nCUDA Convolution doesn't support %dx%d kernel\n", f0, f1); + "\nCUDA Convolution doesn't support %lldx%lld kernel\n", + fdims[0], fdims[1]); CUDA_NOT_SUPPORTED(errMessage); } auto convolve2 = common::getKernel( "arrayfire::cuda::convolve2", {{convolve2_cuh_src}}, TemplateArgs(TemplateTypename(), TemplateTypename(), - TemplateArg(expand), TemplateArg(f0), TemplateArg(f1)), + TemplateArg(expand), TemplateArg(fdims[0]), + TemplateArg(fdims[1])), {{DefineValue(MAX_CONV1_FILTER_LEN), DefineValue(CONV_THREADS), DefineValue(CONV2_THREADS_X), DefineValue(CONV2_THREADS_Y)}}); - // FIXME: case where filter array is strided auto constMemPtr = convolve2.getDevPtr(conv_c_name); - convolve2.copyToReadOnly(constMemPtr, reinterpret_cast(fptr), - f0 * f1 * sizeof(aT)); + if (fstrides[1] == fdims[0]) { + // linear filter array + convolve2.copyToReadOnly(constMemPtr, + reinterpret_cast(fptr), + fdims[0] * fdims[1] * sizeof(aT)); + } else { + // strided filter array + convolve2.copyToReadOnly2D( + constMemPtr, reinterpret_cast(fptr), 0, + fstrides[1] * sizeof(aT), fdims[1], fdims[0] * sizeof(aT)); + } EnqueueArgs qArgs(p.mBlocks, p.mThreads, getActiveStream()); convolve2(qArgs, out, sig, p.mBlk_x, p.mBlk_y, p.o[1], p.o[2], p.s[1], @@ -192,7 +203,7 @@ void convolve_2d(conv_kparam_t& p, Param out, CParam sig, CParam filt, p.s[1] = (p.inHasNoOffset ? 0 : b2); p.s[2] = (p.inHasNoOffset ? 0 : b3); - conv2Helper(p, out, sig, fptr, filt.dims[0], filt.dims[1], + conv2Helper(p, out, sig, fptr, filt.dims, filt.strides, expand); } } @@ -218,11 +229,18 @@ void convolve_3d(conv_kparam_t& p, Param out, CParam sig, CParam filt, const aT* fptr = filt.ptr + f3Off; - // FIXME: case where filter array is strided auto constMemPtr = convolve3.getDevPtr(conv_c_name); - convolve3.copyToReadOnly( - constMemPtr, reinterpret_cast(fptr), filterSize); - + if (filt.strides[2] == filt.dims[0] * filt.dims[1]) { + // linear filter array + convolve3.copyToReadOnly( + constMemPtr, reinterpret_cast(fptr), filterSize); + } else { + // strided filter array + convolve3.copyToReadOnly3D( + constMemPtr, reinterpret_cast(fptr), 0, + filt.strides[1] * sizeof(aT), filt.strides[2] / filt.strides[1], + filt.dims[2], filt.dims[1], filt.dims[0] * sizeof(aT)); + } p.o[2] = (p.outHasNoOffset ? 0 : b3); p.s[2] = (p.inHasNoOffset ? 0 : b3); @@ -321,11 +339,26 @@ void convolve2(Param out, CParam signal, CParam filter, int conv_dim, dim3 blocks(blk_x * signal.dims[2], blk_y * signal.dims[3]); - // FIXME: case where filter array is strided auto constMemPtr = convolve2_separable.getDevPtr(sconv_c_name); - convolve2_separable.copyToReadOnly( - constMemPtr, reinterpret_cast(filter.ptr), - fLen * sizeof(aT)); + if (filter.strides[1] == filter.dims[0]) { + // linear filter array + convolve2_separable.copyToReadOnly( + constMemPtr, reinterpret_cast(filter.ptr), + fLen * sizeof(aT)); + } else { + // strided filter array 4D + const aT* fptr = filter.ptr; + CUdeviceptr dptr = constMemPtr; + for (dim_t d3 = 0; d3 < filter.dims[3]; ++d3, fptr += filter.strides[3], + dptr += filter.dims[0] * filter.dims[1] * filter.dims[2] * + sizeof(aT)) { + convolve2_separable.copyToReadOnly3D( + dptr, reinterpret_cast(fptr), 0, + filter.strides[1] * sizeof(aT), + filter.strides[2] + filter.strides[1], filter.dims[2], + filter.dims[1], filter.dims[0] * sizeof(aT)); + } + } EnqueueArgs qArgs(blocks, threads, getActiveStream()); convolve2_separable(qArgs, out, signal, blk_x, blk_y); diff --git a/src/backend/cuda/kernel/fast.hpp b/src/backend/cuda/kernel/fast.hpp index 7b54162b42..c98726d0a1 100644 --- a/src/backend/cuda/kernel/fast.hpp +++ b/src/backend/cuda/kernel/fast.hpp @@ -177,16 +177,17 @@ __device__ void load_shared_image(CParam in, T *local_image, unsigned ix, // Copy an image patch to shared memory, with a 3-pixel edge if (ix < lx && iy < ly && x - 3 < in.dims[0] && y - 3 < in.dims[1]) { local_image[(ix) + (bx + 6) * (iy)] = - in.ptr[(x - 3) + in.dims[0] * (y - 3)]; + in.ptr[in.strides[0] * (x - 3) + in.strides[1] * (y - 3)]; if (x + lx - 3 < in.dims[0]) local_image[(ix + lx) + (bx + 6) * (iy)] = - in.ptr[(x + lx - 3) + in.dims[0] * (y - 3)]; + in.ptr[in.strides[0] * (x + lx - 3) + in.strides[1] * (y - 3)]; if (y + ly - 3 < in.dims[1]) local_image[(ix) + (bx + 6) * (iy + ly)] = - in.ptr[(x - 3) + in.dims[0] * (y + ly - 3)]; + in.ptr[in.strides[0] * (x - 3) + in.strides[1] * (y + ly - 3)]; if (x + lx - 3 < in.dims[0] && y + ly - 3 < in.dims[1]) local_image[(ix + lx) + (bx + 6) * (iy + ly)] = - in.ptr[(x + lx - 3) + in.dims[0] * (y + ly - 3)]; + in.ptr[in.strides[0] * (x + lx - 3) + + in.strides[1] * (y + ly - 3)]; } } diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp index e956f02441..072657550d 100644 --- a/src/backend/cuda/kernel/harris.hpp +++ b/src/backend/cuda/kernel/harris.hpp @@ -174,7 +174,7 @@ void harris(unsigned* corners_out, float** x_out, float** y_out, filter.strides[k] = filter.dims[k - 1] * filter.strides[k - 1]; } - int filter_elem = filter.strides[3] * filter.dims[3]; + int filter_elem = filter.dims[0] * filter.dims[1]; auto filter_alloc = memAlloc(filter_elem); filter.ptr = filter_alloc.get(); CUDA_CHECK(cudaMemcpyAsync(filter.ptr, h_filter.data(), @@ -184,9 +184,11 @@ void harris(unsigned* corners_out, float** x_out, float** y_out, const unsigned border_len = filter_len / 2 + 1; Param ix, iy; - for (dim_t i = 0; i < 4; i++) { + ix.dims[0] = iy.dims[0] = in.dims[0]; + ix.strides[0] = iy.strides[0] = 1; + for (dim_t i = 1; i < 4; i++) { ix.dims[i] = iy.dims[i] = in.dims[i]; - ix.strides[i] = iy.strides[i] = in.strides[i]; + ix.strides[i] = iy.strides[i] = ix.strides[i - 1] * ix.dims[i - 1]; } auto ix_alloc = memAlloc(ix.dims[3] * ix.strides[3]); auto iy_alloc = memAlloc(iy.dims[3] * iy.strides[3]); @@ -198,12 +200,16 @@ void harris(unsigned* corners_out, float** x_out, float** y_out, Param ixx, ixy, iyy; Param ixx_tmp, ixy_tmp, iyy_tmp; - for (dim_t i = 0; i < 4; i++) { - ixx.dims[i] = ixy.dims[i] = iyy.dims[i] = in.dims[i]; - ixx_tmp.dims[i] = ixy_tmp.dims[i] = iyy_tmp.dims[i] = in.dims[i]; - ixx.strides[i] = ixy.strides[i] = iyy.strides[i] = in.strides[i]; - ixx_tmp.strides[i] = ixy_tmp.strides[i] = iyy_tmp.strides[i] = - in.strides[i]; + ixx.dims[0] = ixy.dims[0] = iyy.dims[0] = ixx_tmp.dims[0] = + ixy_tmp.dims[0] = iyy_tmp.dims[0] = in.dims[0]; + ixx.strides[0] = ixy.strides[0] = iyy.strides[0] = ixx_tmp.strides[0] = + ixy_tmp.strides[0] = iyy_tmp.strides[0] = 1; + for (dim_t i = 1; i < 4; i++) { + ixx.dims[i] = ixy.dims[i] = iyy.dims[i] = ixx_tmp.dims[i] = + ixy_tmp.dims[i] = iyy_tmp.dims[i] = in.dims[i]; + ixx.strides[i] = ixy.strides[i] = iyy.strides[i] = ixx_tmp.strides[i] = + ixy_tmp.strides[i] = iyy_tmp.strides[i] = + ixx.strides[i - 1] * ixx.dims[i - 1]; } auto ixx_alloc = memAlloc(ixx.dims[3] * ixx.strides[3]); auto ixy_alloc = memAlloc(ixy.dims[3] * ixy.strides[3]); @@ -214,9 +220,9 @@ void harris(unsigned* corners_out, float** x_out, float** y_out, // Compute second-order derivatives dim3 threads(THREADS_PER_BLOCK, 1); - dim3 blocks(divup(in.dims[3] * in.strides[3], threads.x), 1); + dim3 blocks(divup(in.dims[0] * in.dims[1], threads.x), 1); CUDA_LAUNCH((second_order_deriv), blocks, threads, ixx.ptr, ixy.ptr, - iyy.ptr, in.dims[3] * in.strides[3], ix.ptr, iy.ptr); + iyy.ptr, in.dims[0] * in.dims[1], ix.ptr, iy.ptr); auto ixx_tmp_alloc = memAlloc(ixx_tmp.dims[3] * ixx_tmp.strides[3]); auto ixy_tmp_alloc = memAlloc(ixy_tmp.dims[3] * ixy_tmp.strides[3]); @@ -235,7 +241,7 @@ void harris(unsigned* corners_out, float** x_out, float** y_out, // Number of corners is not known a priori, limit maximum number of corners // according to image dimensions - unsigned corner_lim = in.dims[3] * in.strides[3] * 0.2f; + unsigned corner_lim = in.dims[0] * in.dims[1] * 0.2f; auto d_corners_found = memAlloc(1); CUDA_CHECK(cudaMemsetAsync(d_corners_found.get(), 0, sizeof(unsigned), @@ -245,7 +251,7 @@ void harris(unsigned* corners_out, float** x_out, float** y_out, auto d_y_corners = memAlloc(corner_lim); auto d_resp_corners = memAlloc(corner_lim); - auto d_responses = memAlloc(in.dims[3] * in.strides[3]); + auto d_responses = memAlloc(in.dims[0] * in.dims[1]); // Calculate Harris responses for all pixels threads = dim3(BLOCK_SIZE, BLOCK_SIZE); diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp index c1df7620f5..e7da4eadb9 100644 --- a/src/backend/cuda/kernel/orb.hpp +++ b/src/backend/cuda/kernel/orb.hpp @@ -125,10 +125,14 @@ __global__ void harris_response(float* score_out, float* size_out, int j = k % block_size - r; // Calculate local x and y derivatives - float ix = image.ptr[(x + i + 1) * image.dims[0] + y + j] - - image.ptr[(x + i - 1) * image.dims[0] + y + j]; - float iy = image.ptr[(x + i) * image.dims[0] + y + j + 1] - - image.ptr[(x + i) * image.dims[0] + y + j - 1]; + float ix = image.ptr[(x + i + 1) * image.strides[1] + + (y + j) * image.strides[0]] - + image.ptr[(x + i - 1) * image.strides[1] + + (y + j) * image.strides[0]]; + float iy = image.ptr[(x + i) * image.strides[1] + + (y + j + 1) * image.strides[0]] - + image.ptr[(x + i) * image.strides[1] + + (y + j - 1) * image.strides[0]]; // Accumulate second order derivatives ixx += ix * ix; @@ -181,7 +185,8 @@ __global__ void centroid_angle(const float* x_in, const float* y_in, int j = k % patch_size - r; // Calculate first order moments - T p = image.ptr[(x + i) * image.dims[0] + y + j]; + T p = image.ptr[(x + i) * image.strides[1] + + (y + j) * image.strides[0]]; m01 += j * p; m10 += i * p; } @@ -209,7 +214,7 @@ inline __device__ T get_pixel(unsigned x, unsigned y, const float ori, x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin); y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos); - return image.ptr[x * image.dims[0] + y]; + return image.ptr[x * image.strides[1] + y * image.strides[0]]; } inline __device__ int lookup(const int n, cudaTextureObject_t tex) { diff --git a/src/backend/oneapi/kernel/convolve.hpp b/src/backend/oneapi/kernel/convolve.hpp index ebec7dbe88..b582116726 100644 --- a/src/backend/oneapi/kernel/convolve.hpp +++ b/src/backend/oneapi/kernel/convolve.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -75,7 +76,7 @@ void prepareKernelArgs(conv_kparam_t ¶m, dim_t *oDims, param.nBBS0 = divup(oDims[0], THREADS); param.nBBS1 = batchDims[2]; param.global = range<3>(param.nBBS0 * THREADS * batchDims[1], - param.nBBS1 * batchDims[3], 1); + param.nBBS1 * batchDims[3], 1); param.loc_size = (THREADS + 2 * (fDims[0] - 1)); } else if (rank == 2) { param.local = range<3>{THREADS_X, THREADS_Y, 1}; @@ -89,26 +90,13 @@ void prepareKernelArgs(conv_kparam_t ¶m, dim_t *oDims, param.nBBS1 = divup(oDims[1], CUBE_Y); int blk_z = divup(oDims[2], CUBE_Z); param.global = range<3>(param.nBBS0 * CUBE_X * batchDims[3], - param.nBBS1 * CUBE_Y, blk_z * CUBE_Z); + param.nBBS1 * CUBE_Y, blk_z * CUBE_Z); param.loc_size = (CUBE_X + 2 * (fDims[0] - 1)) * (CUBE_Y + 2 * (fDims[1] - 1)) * (CUBE_Z + 2 * (fDims[2] - 1)); } } -template -void memcpyBuffer(sycl::buffer &dest, sycl::buffer &src, - const size_t n, const size_t srcOffset) { - getQueue().submit([&](auto &h) { - sycl::accessor srcAcc{src, h, sycl::range{n}, sycl::id{srcOffset}, - sycl::read_only}; - sycl::accessor destAcc{ - dest, h, sycl::range{n}, sycl::id{0}, sycl::write_only, - sycl::no_init}; - h.copy(srcAcc, destAcc); - }); -} - #include "convolve1.hpp" #include "convolve2.hpp" #include "convolve3.hpp" diff --git a/src/backend/oneapi/kernel/convolve1.hpp b/src/backend/oneapi/kernel/convolve1.hpp index 41c6facae6..7f41d29d20 100644 --- a/src/backend/oneapi/kernel/convolve1.hpp +++ b/src/backend/oneapi/kernel/convolve1.hpp @@ -134,20 +134,23 @@ void conv1Helper(const conv_kparam_t ¶m, Param &out, template void conv1(conv_kparam_t &p, Param &out, const Param &sig, const Param &filt, const bool expand) { - const size_t se_size = filt.info.dims[0]; - sycl::buffer impulse{sycl::range(filt.info.dims[0])}; - int f0Off = filt.info.offset; + const dim_t se_size = filt.info.dims[0]; + sycl::buffer impulse{sycl::range(se_size)}; + const dim_t mstrides[4] = {1, se_size, se_size, se_size}; + const dim_t mdims[4] = {filt.info.dims[0], 1, 1, 1}; + const dim_t f0Off = filt.info.offset; for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) { - int f3Off = b3 * filt.info.strides[3]; + const dim_t f3Off = b3 * filt.info.strides[3]; for (int b2 = 0; b2 < filt.info.dims[2]; ++b2) { - int f2Off = b2 * filt.info.strides[2]; + const dim_t f2Off = b2 * filt.info.strides[2]; for (int b1 = 0; b1 < filt.info.dims[1]; ++b1) { - int f1Off = b1 * filt.info.strides[1]; + const dim_t f1Off = b1 * filt.info.strides[1]; - const size_t srcOffset = f0Off + f1Off + f2Off + f3Off; - memcpyBuffer(impulse, *filt.data, se_size, srcOffset); + const dim_t srcOffset = f0Off + f1Off + f2Off + f3Off; + kernel::memcopy(&impulse, mstrides, filt.data, mdims, + filt.info.strides, srcOffset, 1); p.impulse = &impulse; p.o[0] = (p.outHasNoOffset ? 0 : b1); diff --git a/src/backend/oneapi/kernel/convolve2.hpp b/src/backend/oneapi/kernel/convolve2.hpp index 45bfa6c108..cf5f17ed1a 100644 --- a/src/backend/oneapi/kernel/convolve2.hpp +++ b/src/backend/oneapi/kernel/convolve2.hpp @@ -155,18 +155,21 @@ void conv2Helper(const conv_kparam_t ¶m, Param out, template void conv2(conv_kparam_t &p, Param &out, const Param &sig, const Param &filt, const bool expand) { - size_t se_size = filt.info.dims[0] * filt.info.dims[1]; + const dim_t se_size = filt.info.dims[0] * filt.info.dims[1]; sycl::buffer impulse{sycl::range(se_size)}; - int f0Off = filt.info.offset; + const dim_t mstrides[4] = {1, filt.info.dims[0], se_size, se_size}; + const dim_t mdims[4] = {filt.info.dims[0], filt.info.dims[1], 1, 1}; + const dim_t f0Off = filt.info.offset; for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) { - int f3Off = b3 * filt.info.strides[3]; + const dim_t f3Off = b3 * filt.info.strides[3]; for (int b2 = 0; b2 < filt.info.dims[2]; ++b2) { - int f2Off = b2 * filt.info.strides[2]; + const dim_t f2Off = b2 * filt.info.strides[2]; - const size_t srcOffset = f2Off + f3Off + f0Off; - memcpyBuffer(impulse, *filt.data, se_size, srcOffset); + const dim_t srcOffset = f2Off + f3Off + f0Off; + kernel::memcopy(&impulse, mstrides, filt.data, mdims, + filt.info.strides, srcOffset, 2); p.impulse = &impulse; p.o[1] = (p.outHasNoOffset ? 0 : b2); diff --git a/src/backend/oneapi/kernel/convolve3.hpp b/src/backend/oneapi/kernel/convolve3.hpp index bdfcc4eb24..2b402ab32b 100644 --- a/src/backend/oneapi/kernel/convolve3.hpp +++ b/src/backend/oneapi/kernel/convolve3.hpp @@ -164,15 +164,21 @@ void conv3Helper(const conv_kparam_t ¶m, Param &out, template void conv3(conv_kparam_t &p, Param &out, const Param &sig, const Param &filt, const bool expand) { - size_t se_size = filt.info.dims[0] * filt.info.dims[1] * filt.info.dims[2]; + const dim_t se_size = + filt.info.dims[0] * filt.info.dims[1] * filt.info.dims[2]; sycl::buffer impulse{sycl::range(se_size)}; - int f0Off = filt.info.offset; + const dim_t mstrides[4] = {1, filt.info.dims[0], + filt.info.dims[0] * filt.info.dims[1], se_size}; + const dim_t mdims[4] = {filt.info.dims[0], filt.info.dims[1], + filt.info.dims[2], 1}; + const dim_t f0Off = filt.info.offset; for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) { - int f3Off = b3 * filt.info.strides[3]; + const dim_t f3Off = b3 * filt.info.strides[3]; - const size_t srcOffset = f3Off + f0Off; - memcpyBuffer(impulse, *filt.data, se_size, srcOffset); + const dim_t srcOffset = f3Off + f0Off; + kernel::memcopy(&impulse, mstrides, filt.data, mdims, filt.info.strides, + srcOffset, 3); p.impulse = &impulse; p.o[2] = (p.outHasNoOffset ? 0 : b3); diff --git a/src/backend/oneapi/kernel/convolve_separable.cpp b/src/backend/oneapi/kernel/convolve_separable.cpp index 0f3dfacb30..5b9d1d6453 100644 --- a/src/backend/oneapi/kernel/convolve_separable.cpp +++ b/src/backend/oneapi/kernel/convolve_separable.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -136,19 +137,6 @@ class convolveSeparableCreateKernel { sycl::local_accessor localMem_; }; -template -void memcpyBuffer(sycl::buffer &dest, sycl::buffer &src, - const size_t n, const size_t srcOffset) { - getQueue().submit([&](auto &h) { - sycl::accessor srcAcc{src, h, sycl::range{n}, sycl::id{srcOffset}, - sycl::read_only}; - sycl::accessor destAcc{ - dest, h, sycl::range{n}, sycl::id{0}, sycl::write_only, - sycl::no_init}; - h.copy(srcAcc, destAcc); - }); -} - template void convSep(Param out, const Param signal, const Param filter, const int conv_dim, const bool expand) { @@ -161,7 +149,7 @@ void convSep(Param out, const Param signal, const Param filter, constexpr int THREADS_X = 16; constexpr int THREADS_Y = 16; - const int fLen = filter.info.dims[0] * filter.info.dims[1]; + const dim_t fLen = filter.info.dims[0] * filter.info.dims[1]; const size_t C0_SIZE = (THREADS_X + 2 * (fLen - 1)) * THREADS_Y; const size_t C1_SIZE = (THREADS_Y + 2 * (fLen - 1)) * THREADS_X; size_t locSize = (conv_dim == 0 ? C0_SIZE : C1_SIZE); @@ -175,7 +163,10 @@ void convSep(Param out, const Param signal, const Param filter, blk_y * signal.info.dims[3] * THREADS_Y); sycl::buffer mBuff = {sycl::range(fLen * sizeof(accType))}; - memcpyBuffer(mBuff, *filter.data, fLen, 0); + const dim_t mstrides[4] = {1, filter.info.dims[0], fLen, fLen}; + const dim_t mdims[4] = {filter.info.dims[0], filter.info.dims[1], 1, 1}; + kernel::memcopy(&mBuff, mstrides, filter.data, mdims, filter.info.strides, + filter.info.offset, 2); getQueue().submit([&](auto &h) { sycl::accessor d_signal{*signal.data, h, sycl::read_only}; diff --git a/src/backend/opencl/Kernel.cpp b/src/backend/opencl/Kernel.cpp index b5d818b6d2..e8286c5329 100644 --- a/src/backend/opencl/Kernel.cpp +++ b/src/backend/opencl/Kernel.cpp @@ -27,6 +27,55 @@ void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src, getQueue().enqueueCopyBuffer(*src, *dst, 0, 0, bytes); } +void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src, + size_t srcXInBytes, size_t bytes) { + getQueue().enqueueCopyBuffer(*src, *dst, srcXInBytes, 0, bytes); +} + +void Kernel::copyToReadOnly2D(Kernel::DevPtrType dst, Kernel::DevPtrType src, + size_t srcXInBytes, size_t srcPitchInBytes, + size_t height, size_t widthInBytes) { + std::array src_origin = {srcXInBytes, 0, 0}; + size_t src_row_pitch = {srcPitchInBytes}; + size_t src_slice_pitch = {0}; + + std::array dst_origin = {0, 0, 0}; + size_t dst_row_pitch = {widthInBytes}; + size_t dst_slice_pitch = {0}; + + std::array region = {widthInBytes, height, 1}; + + // offset in bytes = + // src_origin[1]*src_row_pitch + src_origin[0] + + getQueue().enqueueCopyBufferRect(*src, *dst, src_origin, dst_origin, region, + src_row_pitch, src_slice_pitch, + dst_row_pitch, dst_slice_pitch); +} + +void Kernel::copyToReadOnly3D(Kernel::DevPtrType dst, Kernel::DevPtrType src, + size_t srcXInBytes, size_t srcPitchInBytes, + size_t srcHeight, size_t depth, size_t height, + size_t widthInBytes) { + std::array src_origin = {srcXInBytes, 0, 0}; + size_t src_row_pitch = {srcPitchInBytes}; + size_t src_slice_pitch = {srcHeight * srcPitchInBytes}; + + std::array dst_origin = {0, 0, 0}; + size_t dst_row_pitch = {widthInBytes}; + size_t dst_slice_pitch = {height * widthInBytes}; + + std::array region = {widthInBytes, height, depth}; + + // offset in bytes = + // src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + + // src_origin[0] + + getQueue().enqueueCopyBufferRect(*src, *dst, src_origin, dst_origin, region, + src_row_pitch, src_slice_pitch, + dst_row_pitch, dst_slice_pitch); +} + void Kernel::setFlag(Kernel::DevPtrType dst, int* scalarValPtr, const bool syncCopy) { UNUSED(syncCopy); diff --git a/src/backend/opencl/Kernel.hpp b/src/backend/opencl/Kernel.hpp index c5582d8f1c..9fc05ce3e1 100644 --- a/src/backend/opencl/Kernel.hpp +++ b/src/backend/opencl/Kernel.hpp @@ -54,6 +54,18 @@ class Kernel void copyToReadOnly(DevPtrType dst, DevPtrType src, size_t bytes) final; + void copyToReadOnly(DevPtrType dst, DevPtrType src, size_t srcXInBytes, + size_t bytes) final; + + void copyToReadOnly2D(DevPtrType dst, DevPtrType src, size_t srcXInBytes, + size_t srcPitchInBytes, size_t height, + size_t widthInBytes) final; + + void copyToReadOnly3D(DevPtrType dst, DevPtrType src, size_t srcXInBytes, + size_t srcPitchInBytes, size_t srcHeight, + size_t depth, size_t height, + size_t widthInBytes) final; + void setFlag(DevPtrType dst, int* scalarValPtr, const bool syncCopy = false) final; diff --git a/src/backend/opencl/kernel/convolve/conv1.cpp b/src/backend/opencl/kernel/convolve/conv1.cpp index 5bfa9668d6..d3f2f05f29 100644 --- a/src/backend/opencl/kernel/convolve/conv1.cpp +++ b/src/backend/opencl/kernel/convolve/conv1.cpp @@ -18,22 +18,16 @@ void conv1(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt, const bool expand) { size_t se_size = filt.info.dims[0] * sizeof(aT); p.impulse = bufferAlloc(se_size); - int f0Off = filt.info.offset; + dim_t f0Off = filt.info.offset; for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) { - int f3Off = b3 * filt.info.strides[3]; + dim_t f3Off = b3 * filt.info.strides[3]; for (int b2 = 0; b2 < filt.info.dims[2]; ++b2) { - int f2Off = b2 * filt.info.strides[2]; + dim_t f2Off = b2 * filt.info.strides[2]; for (int b1 = 0; b1 < filt.info.dims[1]; ++b1) { - int f1Off = b1 * filt.info.strides[1]; - - // FIXME: if the filter array is strided, direct copy of symbols - // might cause issues - getQueue().enqueueCopyBuffer( - *filt.data, *p.impulse, - (f0Off + f1Off + f2Off + f3Off) * sizeof(aT), 0, se_size); + dim_t f1Off = b1 * filt.info.strides[1]; p.o[0] = (p.outHasNoOffset ? 0 : b1); p.o[1] = (p.outHasNoOffset ? 0 : b2); @@ -42,7 +36,9 @@ void conv1(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt, p.s[1] = (p.inHasNoOffset ? 0 : b2); p.s[2] = (p.inHasNoOffset ? 0 : b3); - convNHelper(p, out, sig, filt, 1, expand); + convNHelper(p, out, sig, filt, + (f0Off + f1Off + f2Off + f3Off) * sizeof(aT), + 1, expand); } } } diff --git a/src/backend/opencl/kernel/convolve/conv2_impl.hpp b/src/backend/opencl/kernel/convolve/conv2_impl.hpp index 9798714750..12f2c2fdf8 100644 --- a/src/backend/opencl/kernel/convolve/conv2_impl.hpp +++ b/src/backend/opencl/kernel/convolve/conv2_impl.hpp @@ -18,7 +18,8 @@ namespace kernel { template void conv2Helper(const conv_kparam_t& param, Param out, const Param signal, - const Param filter, const bool expand) { + const Param filter, const size_t srcXInBytes, + const bool expand) { using cl::EnqueueArgs; using cl::NDRange; using std::string; @@ -27,8 +28,8 @@ void conv2Helper(const conv_kparam_t& param, Param out, const Param signal, constexpr bool IsComplex = std::is_same::value || std::is_same::value; - const int f0 = filter.info.dims[0]; - const int f1 = filter.info.dims[1]; + const dim_t f0 = filter.info.dims[0]; + const dim_t f1 = filter.info.dims[1]; const size_t LOC_SIZE = (THREADS_X + 2 * (f0 - 1)) * (THREADS_Y + 2 * (f1 - 1)); @@ -54,6 +55,17 @@ void conv2Helper(const conv_kparam_t& param, Param out, const Param signal, auto convolve = common::getKernel( "convolve", {{ops_cl_src, convolve_cl_src}}, tmpltArgs, compileOpts); + if (filter.info.strides[1] == f0) { + // 2D linear filter array + convolve.copyToReadOnly(param.impulse, filter.data, srcXInBytes, + f0 * f1 * sizeof(aT)); + } else { + // 2D strided filter array + convolve.copyToReadOnly2D(param.impulse, filter.data, srcXInBytes, + filter.info.strides[1] * sizeof(aT), f1, + f0 * sizeof(aT)); + } + convolve(EnqueueArgs(getQueue(), param.global, param.local), *out.data, out.info, *signal.data, signal.info, *param.impulse, filter.info, param.nBBS0, param.nBBS1, param.o[1], param.o[2], param.s[1], @@ -65,26 +77,21 @@ void conv2(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt, const bool expand) { size_t se_size = filt.info.dims[0] * filt.info.dims[1] * sizeof(aT); p.impulse = bufferAlloc(se_size); - int f0Off = filt.info.offset; + dim_t f0Off = filt.info.offset; for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) { - int f3Off = b3 * filt.info.strides[3]; + dim_t f3Off = b3 * filt.info.strides[3]; for (int b2 = 0; b2 < filt.info.dims[2]; ++b2) { - int f2Off = b2 * filt.info.strides[2]; - - // FIXME: if the filter array is strided, direct copy of symbols - // might cause issues - getQueue().enqueueCopyBuffer(*filt.data, *p.impulse, - (f2Off + f3Off + f0Off) * sizeof(aT), - 0, se_size); + dim_t f2Off = b2 * filt.info.strides[2]; p.o[1] = (p.outHasNoOffset ? 0 : b2); p.o[2] = (p.outHasNoOffset ? 0 : b3); p.s[1] = (p.inHasNoOffset ? 0 : b2); p.s[2] = (p.inHasNoOffset ? 0 : b3); - conv2Helper(p, out, sig, filt, expand); + conv2Helper(p, out, sig, filt, + (f2Off + f3Off + f0Off) * sizeof(aT), expand); } } } diff --git a/src/backend/opencl/kernel/convolve/conv3.cpp b/src/backend/opencl/kernel/convolve/conv3.cpp index 1383e8f443..4bc545f34f 100644 --- a/src/backend/opencl/kernel/convolve/conv3.cpp +++ b/src/backend/opencl/kernel/convolve/conv3.cpp @@ -23,15 +23,11 @@ void conv3(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt, for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) { int f3Off = b3 * filt.info.strides[3]; - // FIXME: if the filter array is strided, direct copy of symbols - // might cause issues - getQueue().enqueueCopyBuffer(*filt.data, *p.impulse, - (f0Off + f3Off) * sizeof(aT), 0, se_size); + p.o[2] = (p.outHasNoOffset ? 0 : b3); + p.s[2] = (p.inHasNoOffset ? 0 : b3); - p.o[2] = (p.outHasNoOffset ? 0 : b3); - p.s[2] = (p.inHasNoOffset ? 0 : b3); - - convNHelper(p, out, sig, filt, 3, expand); + convNHelper(p, out, sig, filt, (f0Off + f3Off) * sizeof(aT), 3, + expand); } } diff --git a/src/backend/opencl/kernel/convolve/conv_common.hpp b/src/backend/opencl/kernel/convolve/conv_common.hpp index bd93419c7c..dbd96d3168 100644 --- a/src/backend/opencl/kernel/convolve/conv_common.hpp +++ b/src/backend/opencl/kernel/convolve/conv_common.hpp @@ -87,12 +87,17 @@ void prepareKernelArgs(conv_kparam_t& param, dim_t* oDims, const dim_t* fDims, template void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal, - const Param& filter, const int rank, const bool expand) { + const Param& filter, const size_t srcXInBytes, const int rank, + const bool expand) { using cl::EnqueueArgs; using cl::NDRange; using std::string; using std::vector; + const dim_t f0 = filter.info.dims[0]; + const dim_t f1 = filter.info.dims[1]; + const dim_t f2 = filter.info.dims[2]; + constexpr bool IsComplex = std::is_same::value || std::is_same::value; @@ -117,6 +122,27 @@ void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal, auto convolve = common::getKernel( "convolve", {{ops_cl_src, convolve_cl_src}}, tmpltArgs, compileOpts); + switch (rank) { + case 1: + convolve.copyToReadOnly(param.impulse, filter.data, srcXInBytes, + f0 * sizeof(aT)); + break; + case 3: + if (filter.info.strides[2] == f0 * f1) { + // 3D linear filter array + convolve.copyToReadOnly(param.impulse, filter.data, srcXInBytes, + f0 * f1 * f2 * sizeof(aT)); + } else { + // 3D strided filter array + convolve.copyToReadOnly3D( + param.impulse, filter.data, srcXInBytes, + filter.info.strides[1] * sizeof(aT), + filter.info.strides[2] / filter.info.strides[1], f2, f1, + f0 * sizeof(aT)); + } + break; + } + convolve(EnqueueArgs(getQueue(), param.global, param.local), *out.data, out.info, *signal.data, signal.info, cl::Local(param.loc_size), *param.impulse, filter.info, param.nBBS0, param.nBBS1, param.o[0], diff --git a/src/backend/opencl/kernel/convolve_separable.cpp b/src/backend/opencl/kernel/convolve_separable.cpp index 83a9116d72..7800e64ccf 100644 --- a/src/backend/opencl/kernel/convolve_separable.cpp +++ b/src/backend/opencl/kernel/convolve_separable.cpp @@ -76,9 +76,18 @@ void convSep(Param out, const Param signal, const Param filter, blk_y * signal.info.dims[3] * THREADS_Y); cl::Buffer *mBuff = bufferAlloc(fLen * sizeof(accType)); - // FIX ME: if the filter array is strided, direct might cause issues - getQueue().enqueueCopyBuffer(*filter.data, *mBuff, 0, 0, - fLen * sizeof(accType)); + if (fLen == filter.info.strides[2]) { + // Linear 2D filter array + conv.copyToReadOnly(mBuff, filter.data, + filter.info.offset * sizeof(accType), + fLen * sizeof(accType)); + } else { + // Strided 2D filter array + conv.copyToReadOnly2D( + mBuff, filter.data, filter.info.offset * sizeof(accType), + filter.info.strides[1] * sizeof(accType), filter.info.dims[1], + filter.info.dims[0] * sizeof(accType)); + } conv(cl::EnqueueArgs(getQueue(), global, local), *out.data, out.info, *signal.data, signal.info, *mBuff, blk_x, blk_y); diff --git a/src/backend/opencl/kernel/fast.cl b/src/backend/opencl/kernel/fast.cl index ef80350f01..48d628fa2b 100644 --- a/src/backend/opencl/kernel/fast.cl +++ b/src/backend/opencl/kernel/fast.cl @@ -129,17 +129,18 @@ void load_shared_image(global const T* in, KParam iInfo, unsigned lx, unsigned ly) { // Copy an image patch to shared memory, with a 3-pixel edge if (ix < lx && iy < ly && x - 3 < iInfo.dims[0] && y - 3 < iInfo.dims[1]) { + in += iInfo.offset; local_image[(ix) + (bx + 6) * (iy)] = - in[(x - 3) + iInfo.dims[0] * (y - 3)]; + in[(x - 3) * iInfo.strides[0] + (y - 3) * iInfo.strides[1]]; if (x + lx - 3 < iInfo.dims[0]) local_image[(ix + lx) + (bx + 6) * (iy)] = - in[(x + lx - 3) + iInfo.dims[0] * (y - 3)]; + in[(x + lx -3) * iInfo.strides[0] + (y - 3) * iInfo.strides[1]]; if (y + ly - 3 < iInfo.dims[1]) local_image[(ix) + (bx + 6) * (iy + ly)] = - in[(x - 3) + iInfo.dims[0] * (y + ly - 3)]; + in[(x - 3) * iInfo.strides[0] + (y + ly - 3) * iInfo.strides[1]]; if (x + lx - 3 < iInfo.dims[0] && y + ly - 3 < iInfo.dims[1]) local_image[(ix + lx) + (bx + 6) * (iy + ly)] = - in[(x + lx - 3) + iInfo.dims[0] * (y + ly - 3)]; + in[(x + lx - 3) * iInfo.strides[0] + (y + ly - 3) * iInfo.strides[1]]; } } @@ -155,7 +156,7 @@ kernel void locate_features(global const T* in, KParam iInfo, unsigned lx = bx / 2 + 3; unsigned ly = by / 2 + 3; - load_shared_image(in + iInfo.offset, iInfo, local_image, ix, iy, bx, by, x, + load_shared_image(in, iInfo, local_image, ix, iy, bx, by, x, y, lx, ly); barrier(CLK_LOCAL_MEM_FENCE); locate_features_core(local_image, score, iInfo, thr, x, y, edge); diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp index 835c20c745..e1e0ee5186 100644 --- a/src/backend/opencl/kernel/harris.hpp +++ b/src/backend/opencl/kernel/harris.hpp @@ -126,21 +126,20 @@ void harris(unsigned *corners_out, Param &x_out, Param &y_out, Param &resp_out, // Second order-derivatives kernel sizes const unsigned blk_x_so = - divup(in.info.dims[3] * in.info.strides[3], HARRIS_THREADS_PER_GROUP); + divup(in.info.dims[0] * in.info.dims[1], HARRIS_THREADS_PER_GROUP); const NDRange local_so(HARRIS_THREADS_PER_GROUP, 1); const NDRange global_so(blk_x_so * HARRIS_THREADS_PER_GROUP, 1); // Compute second-order derivatives soOp(EnqueueArgs(getQueue(), global_so, local_so), *ixx.get(), *ixy.get(), - *iyy.get(), in.info.dims[3] * in.info.strides[3], *ix.get(), - *iy.get()); + *iyy.get(), in.info.dims[0] * in.info.dims[1], *ix.get(), *iy.get()); CL_DEBUG_FINISH(getQueue()); // Convolve second order derivatives with proper window filter conv_helper(ixx, ixy, iyy, filter); cl::Buffer *d_responses = - bufferAlloc(in.info.dims[3] * in.info.strides[3] * sizeof(T)); + bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T)); // Harris responses kernel sizes unsigned blk_x_hr = @@ -159,7 +158,7 @@ void harris(unsigned *corners_out, Param &x_out, Param &y_out, Param &resp_out, // Number of corners is not known a priori, limit maximum number of corners // according to image dimensions - unsigned corner_lim = in.info.dims[3] * in.info.strides[3] * 0.2f; + unsigned corner_lim = in.info.dims[0] * in.info.dims[1] * 0.2f; unsigned corners_found = 0; cl::Buffer *d_corners_found = bufferAlloc(sizeof(unsigned)); @@ -221,7 +220,8 @@ void harris(unsigned *corners_out, Param &x_out, Param &y_out, Param &resp_out, harris_idx.info.dims[k - 1] * harris_idx.info.strides[k - 1]; } - int sort_elem = harris_resp.info.strides[3] * harris_resp.info.dims[3]; + int sort_elem = harris_resp.info.dims[0] * harris_resp.info.dims[1]; + harris_resp.data = d_resp_corners; // Create indices using range harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned)); diff --git a/src/backend/opencl/kernel/orb.cl b/src/backend/opencl/kernel/orb.cl index d8a31c81ec..be60db3b7b 100644 --- a/src/backend/opencl/kernel/orb.cl +++ b/src/backend/opencl/kernel/orb.cl @@ -128,6 +128,7 @@ kernel void harris_response( local float data[BLOCK_SIZE * BLOCK_SIZE]; unsigned f = get_global_id(0); + image += iInfo.offset; unsigned x, y; float ixx = 0.f, iyy = 0.f, ixy = 0.f; @@ -155,10 +156,10 @@ kernel void harris_response( int j = k % block_size - r; // Calculate local x and y derivatives - float ix = image[(x + i + 1) * iInfo.dims[0] + y + j] - - image[(x + i - 1) * iInfo.dims[0] + y + j]; - float iy = image[(x + i) * iInfo.dims[0] + y + j + 1] - - image[(x + i) * iInfo.dims[0] + y + j - 1]; + float ix = image[(x + i + 1) * iInfo.strides[1] + (y + j) * iInfo.strides[0]] - + image[(x + i - 1) * iInfo.strides[1] + (y + j) * iInfo.strides[0]] ; + float iy = image[(x + i) * iInfo.strides[1] + (y + j + 1) * iInfo.strides[0]] - + image[(x + i) * iInfo.strides[1] + (y + j - 1) * iInfo.strides[0]]; // Accumulate second order derivatives ixx += ix * ix; @@ -219,7 +220,7 @@ kernel void centroid_angle(global const float* x_in, int j = k % patch_size - r; // Calculate first order moments - T p = image[(x + i) * iInfo.dims[0] + y + j]; + T p = image[(x + i) * iInfo.strides[1] + (y + j) * iInfo.strides[0] + iInfo.offset]; m01 += j * p; m10 += i * p; } @@ -246,7 +247,7 @@ inline T get_pixel(unsigned x, unsigned y, const float ori, const unsigned size, x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin); y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos); - return image[x * iInfo.dims[0] + y]; + return image[x * iInfo.strides[1] + y * iInfo.strides[0] + iInfo.offset]; } kernel void extract_orb(global unsigned* desc_out, const unsigned n_feat, diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp index 5d4f523f16..63debc6df4 100644 --- a/src/backend/opencl/kernel/orb.hpp +++ b/src/backend/opencl/kernel/orb.hpp @@ -327,13 +327,14 @@ void orb(unsigned* out_feat, Param& x_out, Param& y_out, Param& score_out, Param lvl_tmp; if (blur_img) { - lvl_filt = lvl_img; - lvl_tmp = lvl_img; - - lvl_filt.data = bufferAlloc(lvl_filt.info.dims[0] * - lvl_filt.info.dims[1] * sizeof(T)); - lvl_tmp.data = bufferAlloc(lvl_tmp.info.dims[0] * - lvl_tmp.info.dims[1] * sizeof(T)); + const dim_t pixels = lvl_img.info.dims[0] * lvl_img.info.dims[1]; + lvl_filt.info = {{lvl_img.info.dims[0], lvl_img.info.dims[1], 1, 1}, + {1, lvl_img.info.dims[0], pixels, pixels}, + 0}; + lvl_filt.data = bufferAlloc(pixels * sizeof(T)); + + lvl_tmp.info = lvl_filt.info; + lvl_tmp.data = bufferAlloc(pixels * sizeof(T)); // Calculate a separable Gaussian kernel if (h_gauss == nullptr) { diff --git a/test/arrayfire_test.cpp b/test/arrayfire_test.cpp index dedaedbf75..6803cc586d 100644 --- a/test/arrayfire_test.cpp +++ b/test/arrayfire_test.cpp @@ -105,17 +105,16 @@ std::string readNextNonEmptyLine(std::ifstream &file) { std::string getBackendName(bool lower) { af::Backend backend = af::getActiveBackend(); - switch(backend) { - case AF_BACKEND_CPU: - return lower ? std::string("cpu") : std::string("CPU"); - case AF_BACKEND_CUDA: - return lower ? std::string("cuda") : std::string("CUDA"); - case AF_BACKEND_OPENCL: - return lower ? std::string("opencl") : std::string("OpenCL"); - case AF_BACKEND_ONEAPI: - return lower ? std::string("oneapi") : std::string("oneAPI"); - default: - return lower ? std::string("unknown") : std::string("Unknown"); + switch (backend) { + case AF_BACKEND_CPU: + return lower ? std::string("cpu") : std::string("CPU"); + case AF_BACKEND_CUDA: + return lower ? std::string("cuda") : std::string("CUDA"); + case AF_BACKEND_OPENCL: + return lower ? std::string("opencl") : std::string("OpenCL"); + case AF_BACKEND_ONEAPI: + return lower ? std::string("oneapi") : std::string("oneAPI"); + default: return lower ? std::string("unknown") : std::string("Unknown"); } } @@ -2046,6 +2045,163 @@ INSTANTIATE(std::complex); INSTANTIATE(std::complex); #undef INSTANTIATE +af::array toTempFormat(tempFormat form, const af::array &in) { + af::array ret; + const af::dim4 &dims = in.dims(); + switch (form) { + case JIT_FORMAT: + switch (in.type()) { + case b8: ret = not(in); break; + default: ret = in * 2; + } + // Make sure that the base array is <> form original + ret.eval(); + switch (in.type()) { + case b8: ret = not(ret); break; + default: ret /= 2; + } + break; + case SUB_FORMAT_dim0: { + af::dim4 pdims(dims); + pdims[0] += 2; + af::array parent = af::randu(pdims, in.type()); + parent(af::seq(1, dims[0]), af::span, af::span, af::span) = in; + ret = parent(af::seq(1, dims[0]), af::span, af::span, af::span); + }; break; + case SUB_FORMAT_dim1: { + af::dim4 pdims(dims); + pdims[1] += 2; + af::array parent = af::randu(pdims, in.type()); + parent(af::span, af::seq(1, dims[1]), af::span, af::span) = in; + ret = parent(af::span, af::seq(1, dims[1]), af::span, af::span); + }; break; + case SUB_FORMAT_dim2: { + af::dim4 pdims(dims); + pdims[2] += 2; + af::array parent = af::randu(pdims, in.type()); + parent(af::span, af::span, af::seq(1, dims[2]), af::span) = in; + ret = parent(af::span, af::span, af::seq(1, dims[2]), af::span); + }; break; + case SUB_FORMAT_dim3: { + af::dim4 pdims(dims); + pdims[3] += 2; + af::array parent = af::randu(pdims, in.type()); + parent(af::span, af::span, af::span, af::seq(1, dims[3])) = in; + ret = parent(af::span, af::span, af::span, af::seq(1, dims[3])); + }; break; + case REORDERED_FORMAT: { + const dim_t idxs[4] = {0, 3, 1, 2}; + // idxs[0] has to be 0, to keep the same data in mem + dim_t rev_idxs[4]; + for (dim_t i = 0; i < 4; ++i) { rev_idxs[idxs[i]] = i; }; + ret = af::reorder(in, idxs[0], idxs[1], idxs[2], idxs[3]); + ret = ret.copy(); // make data linear + ret = af::reorder(ret, rev_idxs[0], rev_idxs[1], rev_idxs[2], + rev_idxs[3]); + // ret has same content as in, although data is stored in + // different order + }; break; + case LINEAR_FORMAT: + default: ret = in.copy(); + }; + return ret; +} + +void toTempFormat(tempFormat form, af_array *out, const af_array &in) { + dim_t dims[4]; + af_get_dims(dims, dims + 1, dims + 2, dims + 3, in); + unsigned numdims; + af_get_numdims(&numdims, in); + af_dtype ty; + af_get_type(&ty, in); + switch (form) { + case JIT_FORMAT: { + // af_array one = nullptr, min_one = nullptr, res = nullptr; + af_array res = nullptr, two = nullptr; + ASSERT_SUCCESS(af_constant(&two, 2, numdims, dims, ty)); + switch (ty) { + case b8: af_not(&res, in); break; + default: + // ret = in + af::constant(1, dims, in.type()); + ASSERT_SUCCESS(af_mul(&res, in, two, false)); + } + // Make sure that the base array is <> form original + ASSERT_SUCCESS(af_eval(res)); + switch (ty) { + case b8: af_not(out, res); break; + default: + ASSERT_SUCCESS(af_div(out, res, two, false)); // NO EVAL!! + } + ASSERT_SUCCESS(af_release_array(two)); + two = nullptr; + ASSERT_SUCCESS(af_release_array(res)); + res = nullptr; + }; break; + case SUB_FORMAT_dim0: { + const dim_t pdims[4] = {dims[0] + 2, dims[1], dims[2], dims[3]}; + af_array parent = nullptr; + ASSERT_SUCCESS(af_randu(&parent, std::max(1u, numdims), pdims, ty)); + const af_seq idxs[4] = {af_make_seq(1, dims[0], 1), af_span, + af_span, af_span}; + + ASSERT_SUCCESS(af_assign_seq(out, parent, numdims, idxs, in)); + ASSERT_SUCCESS(af_index(out, parent, numdims, idxs)); + ASSERT_SUCCESS(af_release_array(parent)); + }; break; + case SUB_FORMAT_dim1: { + const dim_t pdims[4] = {dims[0], dims[1] + 2, dims[2], dims[3]}; + af_array parent = nullptr; + ASSERT_SUCCESS(af_randu(&parent, std::max(2u, numdims), pdims, ty)); + const af_seq idxs[4] = {af_span, af_make_seq(1, dims[1], 1), + af_span, af_span}; + ASSERT_SUCCESS(af_assign_seq(out, parent, numdims, idxs, in)); + ASSERT_SUCCESS(af_index(out, parent, numdims, idxs)); + ASSERT_SUCCESS(af_release_array(parent)); + parent = nullptr; + }; break; + case SUB_FORMAT_dim2: { + const dim_t pdims[4] = {dims[0], dims[1], dims[2] + 2, dims[3]}; + af_array parent = nullptr; + ASSERT_SUCCESS(af_randu(&parent, std::max(3u, numdims), pdims, ty)); + const af_seq idxs[4] = {af_span, af_span, + af_make_seq(1, dims[2], 1), af_span}; + ASSERT_SUCCESS(af_assign_seq(out, parent, numdims, idxs, in)); + ASSERT_SUCCESS(af_index(out, parent, numdims, idxs)); + ASSERT_SUCCESS(af_release_array(parent)); + parent = nullptr; + }; break; + case SUB_FORMAT_dim3: { + const dim_t pdims[4] = {dims[0], dims[1], dims[2], dims[3] + 2}; + af_array parent = nullptr; + ASSERT_SUCCESS(af_randu(&parent, std::max(4u, numdims), pdims, ty)); + const af_seq idxs[4] = {af_span, af_span, af_span, + af_make_seq(1, dims[3], 1)}; + ASSERT_SUCCESS(af_assign_seq(out, parent, numdims, idxs, in)); + ASSERT_SUCCESS(af_index(out, parent, numdims, idxs)); + ASSERT_SUCCESS(af_release_array(parent)); + parent = nullptr; + }; break; + case REORDERED_FORMAT: { + const unsigned idxs[4] = {0, 3, 1, 2}; + // idxs[0] has to be 0, to keep the same data in mem + dim_t rev_idxs[4]; + for (dim_t i = 0; i < 4; ++i) { rev_idxs[idxs[i]] = i; }; + af_array rev = nullptr; + ASSERT_SUCCESS( + af_reorder(&rev, in, idxs[0], idxs[1], idxs[2], idxs[3])); + ASSERT_SUCCESS(af_copy_array(out, rev)); + ASSERT_SUCCESS(af_reorder(out, rev, rev_idxs[0], rev_idxs[1], + rev_idxs[2], rev_idxs[3])); + // ret has same content as in, although data is stored in + // different order + ASSERT_SUCCESS(af_release_array(rev)); + rev = nullptr; + }; break; + case LINEAR_FORMAT: + default: af_copy_array(out, in); + }; +} + int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/test/convolve.cpp b/test/convolve.cpp index 5df8961e1b..2a9c733d59 100644 --- a/test/convolve.cpp +++ b/test/convolve.cpp @@ -1178,8 +1178,86 @@ TEST(ConvolveNN, ZeroPadding_Issue2817) { true); array incoming_gradient = constant(1 / 9.f, 3, 3); - array convolved_grad = convolve2GradientNN(incoming_gradient, signal, filter, - convolved, strides, padding, dilation, - AF_CONV_GRADIENT_FILTER); + array convolved_grad = convolve2GradientNN( + incoming_gradient, signal, filter, convolved, strides, padding, + dilation, AF_CONV_GRADIENT_FILTER); ASSERT_EQ(sum(abs(convolved - convolved_grad)) < 1E-5, true); } + +#define TESTS_TEMP_FORMATS(form) \ + TEST(TEMP_FORMAT, form##_1) { \ + vector numDims; \ + vector> in; \ + vector> tests; \ + \ + readTests( \ + string(TEST_DIR "/convolve/vector_same.test"), numDims, in, \ + tests); \ + array signal(numDims[0], &(in[0].front())); \ + array filter(numDims[1], &(in[1].front())); \ + \ + array out = convolve1(toTempFormat(form, signal), \ + toTempFormat(form, filter), AF_CONV_DEFAULT); \ + array gold = convolve1(signal, filter, AF_CONV_DEFAULT); \ + \ + EXPECT_ARRAYS_EQ(out, gold); \ + } \ + \ + TEST(TEMP_FORMAT, form##_2) { \ + vector numDims; \ + vector> in; \ + vector> tests; \ + \ + readTests( \ + string(TEST_DIR "/convolve/rectangle_same_one2many.test"), \ + numDims, in, tests); \ + array signal(numDims[0], &(in[0].front())); \ + array filter(numDims[1], &(in[1].front())); \ + \ + array out = convolve2(toTempFormat(form, signal), \ + toTempFormat(form, filter), AF_CONV_DEFAULT); \ + array gold = convolve2(signal, filter, AF_CONV_DEFAULT); \ + \ + EXPECT_ARRAYS_EQ(out, gold); \ + } \ + \ + TEST(TEMP_FORMAT, form##_3) { \ + vector numDims; \ + vector> in; \ + vector> tests; \ + \ + readTests( \ + string(TEST_DIR "/convolve/cuboid_same_many2many.test"), numDims, \ + in, tests); \ + array signal(numDims[0], &(in[0].front())); \ + array filter(numDims[1], &(in[1].front())); \ + \ + array out = convolve3(toTempFormat(form, signal), \ + toTempFormat(form, filter), AF_CONV_DEFAULT); \ + array gold = convolve3(signal, filter, AF_CONV_DEFAULT); \ + \ + EXPECT_ARRAYS_EQ(out, gold); \ + } \ + \ + TEST(TEMP_FORMAT, form##_separable) { \ + vector numDims; \ + vector> in; \ + vector> tests; \ + \ + readTests( \ + string(TEST_DIR \ + "/convolve/separable_conv2d_same_rectangle_batch.test"), \ + numDims, in, tests); \ + array signal(numDims[0], &(in[0].front())); \ + array cFilter(numDims[1], &(in[1].front())); \ + array rFilter(numDims[2], &(in[2].front())); \ + \ + array out = \ + convolve(toTempFormat(form, cFilter), toTempFormat(form, rFilter), \ + toTempFormat(form, signal), AF_CONV_DEFAULT); \ + array gold = convolve(cFilter, rFilter, signal, AF_CONV_DEFAULT); \ + \ + EXPECT_ARRAYS_EQ(out, gold); \ + } + +FOREACH_TEMP_FORMAT(TESTS_TEMP_FORMATS) diff --git a/test/fast.cpp b/test/fast.cpp index c5e3225d0e..9114d98b7f 100644 --- a/test/fast.cpp +++ b/test/fast.cpp @@ -239,3 +239,41 @@ TEST(FloatFAST, CPP) { delete[] outOrientation; delete[] outSize; } + +#define TESTS_TEMP_FORMAT(form) \ + TEST(TEMP_FORMAT, form) { \ + UNSUPPORTED_BACKEND(AF_BACKEND_ONEAPI); \ + IMAGEIO_ENABLED_CHECK(); \ + \ + vector inDims; \ + vector inFiles; \ + vector> gold; \ + \ + readImageTests(string(TEST_DIR "/fast/square_nonmax_float.test"), \ + inDims, inFiles, gold); \ + inFiles[0].insert(0, string(TEST_DIR "/fast/")); \ + array in = loadImage(inFiles[0].c_str(), false); \ + \ + features feat = \ + fast(toTempFormat(form, in), 20.0f, 9, true, 0.05f, 3); \ + features gfeat = fast(in, 20.0f, 9, true, 0.05f, 3); \ + \ + /* The results from fast are dependent on threads runtime, so sort by \ + * very simple hash on all columns of feat before comparing */ \ + array score = (feat.getX() * inDims[0].dims[1] + feat.getY()); \ + array idx, score_sorted; \ + sort(score_sorted, idx, score); \ + \ + array gscore = (gfeat.getX() * inDims[0].dims[1] + gfeat.getY()); \ + array gidx, gscore_sorted; \ + sort(gscore_sorted, gidx, gscore); \ + \ + EXPECT_ARRAYS_EQ(feat.getX()(idx), gfeat.getX()(gidx)); \ + EXPECT_ARRAYS_EQ(feat.getY()(idx), gfeat.getY()(gidx)); \ + EXPECT_ARRAYS_EQ(feat.getScore()(idx), gfeat.getScore()(gidx)); \ + EXPECT_ARRAYS_EQ(feat.getOrientation()(idx), \ + gfeat.getOrientation()(gidx)); \ + EXPECT_ARRAYS_EQ(feat.getSize()(idx), gfeat.getSize()(gidx)); \ + } + +FOREACH_TEMP_FORMAT(TESTS_TEMP_FORMAT) diff --git a/test/harris.cpp b/test/harris.cpp index f2fd27d47a..3420edbba4 100644 --- a/test/harris.cpp +++ b/test/harris.cpp @@ -220,3 +220,43 @@ TEST(FloatHarris, CPP) { << "at: " << elIter << endl; } } + +#define TESTS_TEMP_FORMAT(form) \ + TEST(TEMP_FORMAT, form) { \ + UNSUPPORTED_BACKEND(AF_BACKEND_ONEAPI); \ + IMAGEIO_ENABLED_CHECK(); \ + \ + constexpr int MAX_CORNERS = 500; \ + \ + vector inDims; \ + vector inFiles; \ + vector> gold; \ + \ + readImageTests(string(TEST_DIR "/harris/square_0_3.test"), inDims, \ + inFiles, gold); \ + inFiles[0].insert(0, string(TEST_DIR "/harris/")); \ + array in = loadImage(inFiles[0].c_str(), false); \ + \ + features out = \ + harris(toTempFormat(form, in), MAX_CORNERS, 1e5f, 0.0f, 3, 0.04f); \ + features gout = harris(in, MAX_CORNERS, 1e5f, 0.0f, 3, 0.04f); \ + \ + ASSERT_GT(MAX_CORNERS, out.getNumFeatures()); \ + \ + array score = out.getX() * in.dims().dims[1] + out.getY(); \ + array idx, score_sorted; \ + sort(score_sorted, idx, score); \ + \ + array gscore = gout.getX() * in.dims().dims[1] + gout.getY(); \ + array gidx, gscore_sorted; \ + sort(gscore_sorted, gidx, gscore); \ + \ + EXPECT_ARRAYS_EQ(out.getX()(idx), gout.getX()(gidx)); \ + EXPECT_ARRAYS_EQ(out.getY()(idx), gout.getY()(gidx)); \ + EXPECT_ARRAYS_EQ(out.getOrientation()(idx), \ + gout.getOrientation()(gidx)); \ + EXPECT_ARRAYS_EQ(out.getScore()(idx), gout.getScore()(gidx)); \ + EXPECT_ARRAYS_EQ(out.getSize()(idx), gout.getSize()(gidx)); \ + }; + +FOREACH_TEMP_FORMAT(TESTS_TEMP_FORMAT) \ No newline at end of file diff --git a/test/orb.cpp b/test/orb.cpp index 3ace1f4b05..25a4d13eee 100644 --- a/test/orb.cpp +++ b/test/orb.cpp @@ -326,3 +326,59 @@ TEST(ORB, CPP) { delete[] outSize; delete[] outDesc; } + +#define TEST_TEMP_FORMATS(form) \ + TEST(TEMP_FORMAT, form) { \ + UNSUPPORTED_BACKEND(AF_BACKEND_ONEAPI); \ + IMAGEIO_ENABLED_CHECK(); \ + \ + constexpr size_t MAX_FEATURES = 400; \ + \ + vector inDims; \ + vector inFiles; \ + vector> goldFeat; \ + vector> goldDesc; \ + \ + readImageFeaturesDescriptors( \ + string(TEST_DIR "/orb/square.test"), inDims, inFiles, goldFeat, \ + goldDesc); \ + inFiles[0].insert(0, string(TEST_DIR "/orb/")); \ + array in = loadImage(inFiles[0].c_str(), false); \ + \ + features feat; \ + array desc; \ + orb(feat, desc, toTempFormat(form, in), 20.0f, MAX_FEATURES, 1.2f, 8, \ + true); \ + features gfeat; \ + array gdesc; \ + orb(gfeat, gdesc, in, 20.0f, MAX_FEATURES, 1.2f, 8, true); \ + \ + /* The clipping of the features is dependent on threads runtime, so \ + * capture them all. */ \ + ASSERT_GT(MAX_FEATURES, feat.getNumFeatures()) \ + << "Please increase MAX_FEATURES to capture all features"; \ + \ + /* The results from orb are dependent on threads runtime, so sort \ + * by very simple hash on all columns of feat before comparing */ \ + array score = (feat.getX() * inDims[0].dims[1] + feat.getY()) * \ + feat.getScore() * feat.getOrientation() * \ + feat.getSize(); \ + array idx, score_sorted; \ + sort(score_sorted, idx, score); \ + \ + array gscore = (gfeat.getX() * inDims[0].dims[1] + gfeat.getY()) * \ + gfeat.getScore() * gfeat.getOrientation() * \ + gfeat.getSize(); \ + array gidx, gscore_sorted; \ + sort(gscore_sorted, gidx, gscore); \ + \ + EXPECT_ARRAYS_EQ(feat.getX()(idx), gfeat.getX()(gidx)); \ + EXPECT_ARRAYS_EQ(feat.getY()(idx), gfeat.getY()(gidx)); \ + EXPECT_ARRAYS_EQ(feat.getScore()(idx), gfeat.getScore()(gidx)); \ + EXPECT_ARRAYS_EQ(feat.getOrientation()(idx), \ + gfeat.getOrientation()(gidx)); \ + EXPECT_ARRAYS_EQ(feat.getSize()(idx), gfeat.getSize()(gidx)); \ + EXPECT_ARRAYS_EQ(desc(af::span, idx), gdesc(af::span, gidx)); \ + } + +FOREACH_TEMP_FORMAT(TEST_TEMP_FORMATS) diff --git a/test/testHelpers.hpp b/test/testHelpers.hpp index 5f6b02b5a4..405f23309d 100644 --- a/test/testHelpers.hpp +++ b/test/testHelpers.hpp @@ -244,10 +244,10 @@ bool noHalfTests(af::dtype ty); GTEST_SKIP() << "Device doesn't support Half" #ifdef SKIP_UNSUPPORTED_TESTS -#define UNSUPPORTED_BACKEND(backend) \ - if(backend == af::getActiveBackend()) \ - GTEST_SKIP() << "Skipping unsupported function on " \ - + getBackendName() + " backend" +#define UNSUPPORTED_BACKEND(backend) \ + if (backend == af::getActiveBackend()) \ + GTEST_SKIP() << "Skipping unsupported function on " + getBackendName() + \ + " backend" #else #define UNSUPPORTED_BACKEND(backend) #endif @@ -653,6 +653,30 @@ ::testing::AssertionResult assertArrayEq(std::string aName, std::string bName, const af_array a, const af_array b, TestOutputArrayInfo *metadata); +enum tempFormat { + LINEAR_FORMAT, // Linear array (= default) + JIT_FORMAT, // Array which has JIT operations outstanding + SUB_FORMAT_dim0, // Array where only a subset is allocated for dim0 + SUB_FORMAT_dim1, // Array where only a subset is allocated for dim1 + SUB_FORMAT_dim2, // Array where only a subset is allocated for dim2 + SUB_FORMAT_dim3, // Array where only a subset is allocated for dim3 + REORDERED_FORMAT // Array where the dimensions are reordered +}; +// Calls the function fn for all available formats +#define FOREACH_TEMP_FORMAT(TESTS) \ + TESTS(LINEAR_FORMAT) \ + TESTS(JIT_FORMAT) \ + TESTS(SUB_FORMAT_dim0) \ + TESTS(SUB_FORMAT_dim1) \ + TESTS(SUB_FORMAT_dim2) \ + TESTS(SUB_FORMAT_dim3) \ + TESTS(REORDERED_FORMAT) + +// formats the "in" array according to provided format. The content remains +// unchanged. +af::array toTempFormat(tempFormat form, const af::array &in); +void toTempFormat(tempFormat form, af_array *out, const af_array &in); + #ifdef __GNUC__ #pragma GCC diagnostic pop #endif