Skip to content

Fixes sub array (opencl, cuda, cpu, oneapi) for orb #3670

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions src/backend/common/KernelInterface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,51 @@ class KernelInterface {
virtual void copyToReadOnly(DevPtrType dst, DevPtrType src,
size_t bytes) = 0;

/// \brief Copy data from an offset in device memory to read-only memory
///
/// This overload copies `bytes` bytes from the source device pointer to a
/// read-only memory. It differs from the overload without an offset only by
/// the additional `srcXInBytes` parameter.
///
/// \param[in] dst is the device pointer to which data will be copied
/// \param[in] src is the device pointer from which data will be copied
/// \param[in] srcXInBytes is the offset, in bytes, relative to \p src
///            (presumably the first byte read is at src + srcXInBytes;
///            confirm against each backend implementation)
/// \param[in] bytes is the number of bytes of data to be copied
virtual void copyToReadOnly(DevPtrType dst, DevPtrType src,
size_t srcXInBytes, size_t bytes) = 0;

/// \brief Copy strided 2D data from device memory to read-only memory
///
/// This function copies a 2D region, described by a byte offset, a row pitch,
/// a row count and a row width, from the device pointer to a read-only
/// memory.
///
/// \param[in] dst is the device pointer to which data will be copied
/// \param[in] src is the device pointer from which data will be copied
/// \param[in] srcXInBytes is the offset, in bytes, into \p src at which the
///            first row starts
/// \param[in] srcPitchInBytes is the distance, in bytes, between the starts
///            of two consecutive rows of \p src (strides[1] in bytes)
/// \param[in] height is the number of rows to copy (dim[1] of dst)
/// \param[in] widthInBytes is the number of contiguous bytes to copy per row
///            (dim[0] in bytes)
///
/// NOTE(review): the destination pitch is not a parameter; presumably dst is
/// written densely packed with a pitch of widthInBytes - confirm per backend.
virtual void copyToReadOnly2D(DevPtrType dst, DevPtrType src,
size_t srcXInBytes, size_t srcPitchInBytes,
size_t height, size_t widthInBytes) = 0;

/// \brief Copy strided 3D data from device memory to read-only memory
///
/// This function copies a 3D region, described by a byte offset, a row pitch,
/// the allocated source height and the extent to copy, from the device
/// pointer to a read-only memory.
///
/// \param[in] dst is the device pointer to which data will be copied
/// \param[in] src is the device pointer from which data will be copied
/// \param[in] srcXInBytes is the offset, in bytes, into \p src at which the
///            first row starts
/// \param[in] srcPitchInBytes is the distance, in bytes, between the starts
///            of two consecutive rows of \p src (strides[1] in bytes)
/// \param[in] srcHeight is the number of rows ALLOCATED per slice of \p src
///            (dim[1] of the source allocation); it may exceed \p height
/// \param[in] depth is the number of slices to copy (dim[2] of dst)
/// \param[in] height is the number of rows to copy per slice (dim[1] of dst)
/// \param[in] widthInBytes is the number of contiguous bytes to copy per row
///            (dim[0] in bytes)
///
/// NOTE(review): the destination pitch and height are not parameters;
/// presumably dst is written densely packed (pitch of widthInBytes, slice
/// height of height) - confirm per backend.
virtual void copyToReadOnly3D(DevPtrType dst, DevPtrType src,
size_t srcXInBytes, size_t srcPitchInBytes,
size_t srcHeight, size_t depth, size_t height,
size_t widthInBytes) = 0;

/// \brief Copy a single scalar to device memory
///
/// This function copies a single value of type T from host variable
Expand Down
17 changes: 10 additions & 7 deletions src/backend/cpu/kernel/convolve.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ void one2one_3d(InT *optr, InT const *const iptr, AccT const *const fptr,
}
optr[koff + joff + i - iStart] = InT(accum);
} // i loop ends here
} // j loop ends here
} // k loop ends here
} // j loop ends here
} // k loop ends here
}

template<typename InT, typename AccT>
Expand Down Expand Up @@ -217,7 +217,6 @@ void convolve2_separable(InT *optr, InT const *const iptr,
dim_t fDim, af::dim4 const &oStrides,
af::dim4 const &sStrides, dim_t fStride) {
UNUSED(orgDims);
UNUSED(sStrides);
UNUSED(fStride);
for (dim_t j = 0; j < oDims[1]; ++j) {
dim_t jOff = j * oStrides[1];
Expand All @@ -237,14 +236,18 @@ void convolve2_separable(InT *optr, InT const *const iptr,
dim_t offi = ci - f;
bool isCIValid = offi >= 0 && offi < sDims[0];
bool isCJValid = cj >= 0 && cj < sDims[1];
s_val = (isCJValid && isCIValid ? iptr[cj * sDims[0] + offi]
: scalar<InT>(0));
s_val =
(isCJValid && isCIValid ? iptr[cj * sStrides.dims[1] +
offi * sStrides.dims[0]]
: scalar<InT>(0));
} else {
dim_t offj = cj - f;
bool isCIValid = ci >= 0 && ci < sDims[0];
bool isCJValid = offj >= 0 && offj < sDims[1];
s_val = (isCJValid && isCIValid ? iptr[offj * sDims[0] + ci]
: scalar<InT>(0));
s_val =
(isCJValid && isCIValid ? iptr[offj * sStrides.dims[1] +
ci * sStrides.dims[0]]
: scalar<InT>(0));
}

accum += AccT(s_val * f_val);
Expand Down
88 changes: 55 additions & 33 deletions src/backend/cpu/kernel/fast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ inline int idx_x(int i) {
return idx_y(i - 12);
}

inline int idx(int y, int x, unsigned idim0) { return x * idim0 + y; }
// idx()
// Returns the linear element offset of pixel (y, x) in a strided 2D buffer:
// y is scaled by istrides0 and x by istrides1, and the two are summed.
inline int idx(int y, int x, unsigned istrides0, unsigned istrides1) {
    const unsigned offset_along_dim1 = x * istrides1;
    const unsigned offset_along_dim0 = y * istrides0;
    return offset_along_dim1 + offset_along_dim0;
}

// test_greater()
// Tests if a pixel x > p + thr
Expand All @@ -44,9 +46,10 @@ inline int test_smaller(float x, float p, float thr) { return (x < p - thr); }
// Returns 1 when x > p + thr
template<typename T>
inline int test_pixel(const T *image, const float p, float thr, int y, int x,
unsigned idim0) {
return -test_smaller((float)image[idx(y, x, idim0)], p, thr) +
test_greater((float)image[idx(y, x, idim0)], p, thr);
unsigned istrides0, unsigned istrides1) {
return -test_smaller((float)image[idx(y, x, istrides0, istrides1)], p,
thr) +
test_greater((float)image[idx(y, x, istrides0, istrides1)], p, thr);
}

// abs_diff()
Expand All @@ -64,44 +67,61 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
unsigned *count, float const thr,
unsigned const arc_length, unsigned const nonmax,
unsigned const max_feat, unsigned const edge) {
af::dim4 in_dims = in.dims();
T const *in_ptr = in.get();
af::dim4 in_dims = in.dims();
af::dim4 in_strides = in.strides();
T const *in_ptr = in.get();

for (int y = edge; y < (int)(in_dims[0] - edge); y++) {
for (int x = edge; x < (int)(in_dims[1] - edge); x++) {
float p = in_ptr[idx(y, x, in_dims[0])];
float p = in_ptr[idx(y, x, in_strides[0], in_strides[1])];

// Start by testing opposite pixels of the circle that will result
// in a non-keypoint
int d;
d = test_pixel<T>(in_ptr, p, thr, y - 3, x, in_dims[0]) |
test_pixel<T>(in_ptr, p, thr, y + 3, x, in_dims[0]);
d = test_pixel<T>(in_ptr, p, thr, y - 3, x, in_strides[0],
in_strides[1]) |
test_pixel<T>(in_ptr, p, thr, y + 3, x, in_strides[0],
in_strides[1]);
if (d == 0) continue;

d &= test_pixel<T>(in_ptr, p, thr, y - 2, x + 2, in_dims[0]) |
test_pixel<T>(in_ptr, p, thr, y + 2, x - 2, in_dims[0]);
d &= test_pixel<T>(in_ptr, p, thr, y, x + 3, in_dims[0]) |
test_pixel<T>(in_ptr, p, thr, y, x - 3, in_dims[0]);
d &= test_pixel<T>(in_ptr, p, thr, y + 2, x + 2, in_dims[0]) |
test_pixel<T>(in_ptr, p, thr, y - 2, x - 2, in_dims[0]);
d &= test_pixel<T>(in_ptr, p, thr, y - 2, x + 2, in_strides[0],
in_strides[1]) |
test_pixel<T>(in_ptr, p, thr, y + 2, x - 2, in_strides[0],
in_strides[1]);
d &= test_pixel<T>(in_ptr, p, thr, y, x + 3, in_strides[0],
in_strides[1]) |
test_pixel<T>(in_ptr, p, thr, y, x - 3, in_strides[0],
in_strides[1]);
d &= test_pixel<T>(in_ptr, p, thr, y + 2, x + 2, in_strides[0],
in_strides[1]) |
test_pixel<T>(in_ptr, p, thr, y - 2, x - 2, in_strides[0],
in_strides[1]);
if (d == 0) continue;

d &= test_pixel<T>(in_ptr, p, thr, y - 3, x + 1, in_dims[0]) |
test_pixel<T>(in_ptr, p, thr, y + 3, x - 1, in_dims[0]);
d &= test_pixel<T>(in_ptr, p, thr, y - 1, x + 3, in_dims[0]) |
test_pixel<T>(in_ptr, p, thr, y + 1, x - 3, in_dims[0]);
d &= test_pixel<T>(in_ptr, p, thr, y + 1, x + 3, in_dims[0]) |
test_pixel<T>(in_ptr, p, thr, y - 1, x - 3, in_dims[0]);
d &= test_pixel<T>(in_ptr, p, thr, y + 3, x + 1, in_dims[0]) |
test_pixel<T>(in_ptr, p, thr, y - 3, x - 1, in_dims[0]);
d &= test_pixel<T>(in_ptr, p, thr, y - 3, x + 1, in_strides[0],
in_strides[1]) |
test_pixel<T>(in_ptr, p, thr, y + 3, x - 1, in_strides[0],
in_strides[1]);
d &= test_pixel<T>(in_ptr, p, thr, y - 1, x + 3, in_strides[0],
in_strides[1]) |
test_pixel<T>(in_ptr, p, thr, y + 1, x - 3, in_strides[0],
in_strides[1]);
d &= test_pixel<T>(in_ptr, p, thr, y + 1, x + 3, in_strides[0],
in_strides[1]) |
test_pixel<T>(in_ptr, p, thr, y - 1, x - 3, in_strides[0],
in_strides[1]);
d &= test_pixel<T>(in_ptr, p, thr, y + 3, x + 1, in_strides[0],
in_strides[1]) |
test_pixel<T>(in_ptr, p, thr, y - 3, x - 1, in_strides[0],
in_strides[1]);
if (d == 0) continue;

int sum = 0;

// Sum responses [-1, 0 or 1] of first arc_length pixels
for (int i = 0; i < static_cast<int>(arc_length); i++)
sum += test_pixel<T>(in_ptr, p, thr, y + idx_y(i), x + idx_x(i),
in_dims[0]);
in_strides[0], in_strides[1]);

// Test maximum and minimum responses of first segment of
// arc_length pixels
Expand All @@ -113,29 +133,31 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
// circle
for (int i = arc_length; i < 16; i++) {
sum -= test_pixel<T>(in_ptr, p, thr, y + idx_y(i - arc_length),
x + idx_x(i - arc_length), in_dims[0]);
x + idx_x(i - arc_length), in_strides[0],
in_strides[1]);
sum += test_pixel<T>(in_ptr, p, thr, y + idx_y(i), x + idx_x(i),
in_dims[0]);
in_strides[0], in_strides[1]);
max_sum = std::max(max_sum, sum);
min_sum = std::min(min_sum, sum);
}

// To completely test all possible segments, it's necessary to test
// segments that include the top junction of the circle
for (int i = 0; i < static_cast<int>(arc_length - 1); i++) {
sum -= test_pixel<T>(
in_ptr, p, thr, y + idx_y(16 - arc_length + i),
x + idx_x(16 - arc_length + i), in_dims[0]);
sum -= test_pixel<T>(in_ptr, p, thr,
y + idx_y(16 - arc_length + i),
x + idx_x(16 - arc_length + i),
in_strides[0], in_strides[1]);
sum += test_pixel<T>(in_ptr, p, thr, y + idx_y(i), x + idx_x(i),
in_dims[0]);
in_strides[0], in_strides[1]);
max_sum = std::max(max_sum, sum);
min_sum = std::min(min_sum, sum);
}

float s_bright = 0, s_dark = 0;
for (int i = 0; i < 16; i++) {
float p_x =
(float)in_ptr[idx(y + idx_y(i), x + idx_x(i), in_dims[0])];
float p_x = (float)in_ptr[idx(y + idx_y(i), x + idx_x(i),
in_strides[0], in_strides[1])];

s_bright +=
test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr);
Expand All @@ -159,7 +181,7 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
static_cast<float>(std::max(s_bright, s_dark));
if (nonmax == 1) {
float *score_ptr = score.get();
score_ptr[idx(y, x, in_dims[0])] =
score_ptr[idx(y, x, 1, in_dims[0])] =
std::max(s_bright, s_dark);
}
}
Expand Down
34 changes: 19 additions & 15 deletions src/backend/cpu/kernel/orb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,9 @@ void harris_response(float* x_out, float* y_out, float* score_out,
unsigned* usable_feat, CParam<T> image,
const unsigned block_size, const float k_thr,
const unsigned patch_size) {
const af::dim4 idims = image.dims();
const T* image_ptr = image.get();
const af::dim4 idims = image.dims();
const af::dim4 istrides = image.strides();
const T* image_ptr = image.get();
for (unsigned f = 0; f < total_feat; f++) {
unsigned x, y;
float scl = 1.f;
Expand Down Expand Up @@ -154,10 +155,12 @@ void harris_response(float* x_out, float* y_out, float* score_out,
int j = k % block_size - r;

// Calculate local x and y derivatives
float ix = image_ptr[(x + i + 1) * idims[0] + y + j] -
image_ptr[(x + i - 1) * idims[0] + y + j];
float iy = image_ptr[(x + i) * idims[0] + y + j + 1] -
image_ptr[(x + i) * idims[0] + y + j - 1];
float ix =
image_ptr[(x + i + 1) * istrides[1] + (y + j) * istrides[0]] -
image_ptr[(x + i - 1) * istrides[1] + (y + j) * istrides[0]];
float iy =
image_ptr[(x + i) * istrides[1] + (y + j + 1) * istrides[0]] -
image_ptr[(x + i) * istrides[1] + (y + j - 1) * istrides[0]];

// Accumulate second order derivatives
ixx += ix * ix;
Expand Down Expand Up @@ -189,8 +192,9 @@ template<typename T>
void centroid_angle(const float* x_in, const float* y_in,
float* orientation_out, const unsigned total_feat,
CParam<T> image, const unsigned patch_size) {
const af::dim4 idims = image.dims();
const T* image_ptr = image.get();
const af::dim4 idims = image.dims();
const af::dim4 istrides = image.strides();
const T* image_ptr = image.get();
for (unsigned f = 0; f < total_feat; f++) {
unsigned x = (unsigned)round(x_in[f]);
unsigned y = (unsigned)round(y_in[f]);
Expand All @@ -205,7 +209,7 @@ void centroid_angle(const float* x_in, const float* y_in,
int j = k % patch_size - r;

// Calculate first order moments
T p = image_ptr[(x + i) * idims[0] + y + j];
T p = image_ptr[(x + i) * istrides[1] + (y + j) * istrides[0]];
m01 += j * p;
m10 += i * p;
}
Expand All @@ -219,17 +223,17 @@ template<typename T>
inline T get_pixel(unsigned x, unsigned y, const float ori, const unsigned size,
const int dist_x, const int dist_y, CParam<T> image,
const unsigned patch_size) {
const af::dim4 idims = image.dims();
const T* image_ptr = image.get();
float ori_sin = sin(ori);
float ori_cos = cos(ori);
float patch_scl = (float)size / (float)patch_size;
const af::dim4 istrides = image.strides();
const T* image_ptr = image.get();
float ori_sin = sin(ori);
float ori_cos = cos(ori);
float patch_scl = (float)size / (float)patch_size;

// Calculate point coordinates based on orientation and size
x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);

return image_ptr[x * idims[0] + y];
return image_ptr[x * istrides[1] + y * istrides[0]];
}

template<typename T>
Expand Down
61 changes: 61 additions & 0 deletions src/backend/cuda/Kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,67 @@ void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src,
CU_CHECK(cuMemcpyDtoDAsync(dst, src, bytes, getActiveStream()));
}

/// Copies `bytes` bytes of device memory into `dst`, reading from `src`
/// starting `srcXInBytes` bytes past its beginning. The copy is enqueued
/// asynchronously on the active stream.
///
/// \param[in] dst is the device pointer to which data will be copied
/// \param[in] src is the device pointer from which data will be copied
/// \param[in] srcXInBytes is the offset, in bytes, into \p src
/// \param[in] bytes is the number of bytes of data to be copied
void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src,
                            size_t srcXInBytes, size_t bytes) {
    // The offset was previously ignored, so sub-arrays were always copied
    // from the start of the parent buffer. The pointer is handed to the
    // driver as a CUdeviceptr (an integral device address), so the byte
    // offset can be added to it directly.
    CU_CHECK(cuMemcpyDtoDAsync(dst, src + srcXInBytes, bytes,
                               getActiveStream()));
}

/// Copies a strided 2D region of device memory into `dst`, asynchronously on
/// the active stream. The destination is written densely packed: its pitch is
/// set to `widthInBytes`.
///
/// \param[in] dst is the device pointer to which data will be copied
/// \param[in] src is the device pointer from which data will be copied
/// \param[in] srcXInBytes is the offset, in bytes, into \p src
/// \param[in] srcPitchInBytes is the source row pitch in bytes
/// \param[in] height is the number of rows to copy
/// \param[in] widthInBytes is the number of contiguous bytes copied per row
void Kernel::copyToReadOnly2D(Kernel::DevPtrType dst, Kernel::DevPtrType src,
                              size_t srcXInBytes, size_t srcPitchInBytes,
                              size_t height, size_t widthInBytes) {
    // Value-initialize the descriptor so the members that are not assigned
    // below (the host/array alternatives of source and destination) are zero
    // instead of indeterminate when the struct is handed to the driver.
    CUDA_MEMCPY2D pCopy = {};

    // Source: device memory, strided rows, starting at a byte offset.
    pCopy.srcXInBytes   = srcXInBytes;
    pCopy.srcY          = 0;
    pCopy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    pCopy.srcDevice     = src;
    pCopy.srcPitch      = srcPitchInBytes;

    // Destination: device memory, rows packed back to back.
    pCopy.dstXInBytes   = 0;
    pCopy.dstY          = 0;
    pCopy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    pCopy.dstDevice     = dst;
    pCopy.dstPitch      = widthInBytes;

    // Extent of the region to copy.
    pCopy.WidthInBytes = widthInBytes;
    pCopy.Height       = height;
    // CUdeviceptr srcStart = srcDevice + srcY*srcPitch + srcXInBytes;
    // CUdeviceptr dstStart = dstDevice + dstY*dstPitch + dstXInBytes;

    CU_CHECK(cuMemcpy2DAsync(&pCopy, getActiveStream()));
}

/// Copies a strided 3D region of device memory into `dst`, asynchronously on
/// the active stream. The destination is written densely packed: its pitch is
/// set to `widthInBytes` and its slice height to `height`.
///
/// \param[in] dst is the device pointer to which data will be copied
/// \param[in] src is the device pointer from which data will be copied
/// \param[in] srcXInBytes is the offset, in bytes, into \p src
/// \param[in] srcPitchInBytes is the source row pitch in bytes
/// \param[in] srcHeight is the number of rows allocated per source slice
/// \param[in] depth is the number of slices to copy
/// \param[in] height is the number of rows to copy per slice
/// \param[in] widthInBytes is the number of contiguous bytes copied per row
void Kernel::copyToReadOnly3D(Kernel::DevPtrType dst, Kernel::DevPtrType src,
                              size_t srcXInBytes, size_t srcPitchInBytes,
                              size_t srcHeight, size_t depth, size_t height,
                              size_t widthInBytes) {
    // Value-initialize the descriptor: dstLOD and the reserved members were
    // previously left indeterminate, while the driver API requires them to
    // be 0/NULL. This also zeroes the unused host/array alternatives.
    CUDA_MEMCPY3D pCopy = {};

    // Source: device memory, strided rows and slices, starting at an offset.
    pCopy.srcXInBytes   = srcXInBytes;
    pCopy.srcY          = 0;
    pCopy.srcZ          = 0;
    pCopy.srcLOD        = 0;
    pCopy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    pCopy.srcDevice     = src;
    pCopy.srcPitch      = srcPitchInBytes;
    pCopy.srcHeight     = srcHeight;

    // Destination: device memory, rows and slices packed back to back.
    pCopy.dstXInBytes   = 0;
    pCopy.dstY          = 0;
    pCopy.dstZ          = 0;
    pCopy.dstLOD        = 0;
    pCopy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    pCopy.dstDevice     = dst;
    pCopy.dstPitch      = widthInBytes;
    pCopy.dstHeight     = height;

    // Extent of the region to copy.
    pCopy.WidthInBytes = widthInBytes;
    pCopy.Height       = height;
    pCopy.Depth        = depth;
    // CUdeviceptr srcStart =
    // srcDevice + (srcZ*srcHeight+srcY)*srcPitch + srcXInBytes;
    // CUdeviceptr dstStart =
    // dstDevice + (dstZ*dstHeight+dstY)*dstPitch + dstXInBytes;
    CU_CHECK(cuMemcpy3DAsync(&pCopy, getActiveStream()));
}

void Kernel::setFlag(Kernel::DevPtrType dst, int* scalarValPtr,
const bool syncCopy) {
CU_CHECK(
Expand Down
Loading