arrayfire · willyborn · Jun 10, 2025 · Jun 27, 2025 · Jun 27, 2025 · Jun 27, 2025
diff --git a/src/backend/common/KernelInterface.hpp b/src/backend/common/KernelInterface.hpp
@@ -64,6 +64,51 @@ class KernelInterface {
     virtual void copyToReadOnly(DevPtrType dst, DevPtrType src,
                                 size_t bytes) = 0;
 
+    /// \brief Copy data from an offset in device memory to read-only memory
+    ///
+    /// This function copies `bytes` bytes of data, starting at an offset from
+    /// the source device pointer, to a read-only memory.
+    ///
+    /// \param[in] dst is the device pointer to which data will be copied
+    /// \param[in] src is the device pointer from which data will be copied
+    /// \param[in] srcXInBytes is the offset into src, in bytes
+    /// \param[in] bytes is the number of bytes of data to be copied
+    virtual void copyToReadOnly(DevPtrType dst, DevPtrType src,
+                                size_t srcXInBytes, size_t bytes) = 0;
+
+    /// \brief Copy strided 2D data from device memory to read-only memory
+    ///
+    /// This function copies data of any 2D array from the device pointer to a
+    /// read-only memory.
+    ///
+    /// \param[in] dst is the device pointer to which data will be copied
+    /// \param[in] src is the device pointer from which data will be copied
+    /// \param[in] srcXInBytes is the offset into src, in bytes
+    /// \param[in] srcPitchInBytes is strides[1] of src, in bytes
+    /// \param[in] height is the number of elements along dim[1] of dst
+    /// \param[in] widthInBytes is #bytes of contiguous data to copy (dim[0])
+    virtual void copyToReadOnly2D(DevPtrType dst, DevPtrType src,
+                                  size_t srcXInBytes, size_t srcPitchInBytes,
+                                  size_t height, size_t widthInBytes) = 0;
+
+    /// \brief Copy strided 3D data from device memory to read-only memory
+    ///
+    /// This function copies data of any 3D array from the device pointer to a
+    /// read-only memory.
+    ///
+    /// \param[in] dst is the device pointer to which data will be copied
+    /// \param[in] src is the device pointer from which data will be copied
+    /// \param[in] srcXInBytes is the offset into src, in bytes
+    /// \param[in] srcPitchInBytes is strides[1] of src, in bytes
+    /// \param[in] srcHeight is the number of elements ALLOCATED for dim[1] src
+    /// \param[in] depth is the number of elements along dim[2] of dst
+    /// \param[in] height is the number of elements along dim[1] of dst
+    /// \param[in] widthInBytes is #bytes of contiguous data to copy (dim[0])
+    virtual void copyToReadOnly3D(DevPtrType dst, DevPtrType src,
+                                  size_t srcXInBytes, size_t srcPitchInBytes,
+                                  size_t srcHeight, size_t depth, size_t height,
+                                  size_t widthInBytes) = 0;
+
     /// \brief Copy a single scalar to device memory
     ///
     /// This function copies a single value of type T from host variable

diff --git a/src/backend/cpu/kernel/convolve.hpp b/src/backend/cpu/kernel/convolve.hpp
@@ -125,8 +125,8 @@ void one2one_3d(InT *optr, InT const *const iptr, AccT const *const fptr,
                 }
                 optr[koff + joff + i - iStart] = InT(accum);
             }  // i loop ends here
-        }      // j loop ends here
-    }          // k loop ends here
+        }  // j loop ends here
+    }  // k loop ends here
 }
 
 template<typename InT, typename AccT>
@@ -217,7 +217,6 @@ void convolve2_separable(InT *optr, InT const *const iptr,
                          dim_t fDim, af::dim4 const &oStrides,
                          af::dim4 const &sStrides, dim_t fStride) {
     UNUSED(orgDims);
-    UNUSED(sStrides);
     UNUSED(fStride);
     for (dim_t j = 0; j < oDims[1]; ++j) {
         dim_t jOff = j * oStrides[1];
@@ -237,14 +236,18 @@ void convolve2_separable(InT *optr, InT const *const iptr,
                     dim_t offi     = ci - f;
                     bool isCIValid = offi >= 0 && offi < sDims[0];
                     bool isCJValid = cj >= 0 && cj < sDims[1];
-                    s_val = (isCJValid && isCIValid ? iptr[cj * sDims[0] + offi]
-                                                    : scalar<InT>(0));
+                    s_val =
+                        (isCJValid && isCIValid ? iptr[cj * sStrides.dims[1] +
+                                                       offi * sStrides.dims[0]]
+                                                : scalar<InT>(0));
                 } else {
                     dim_t offj     = cj - f;
                     bool isCIValid = ci >= 0 && ci < sDims[0];
                     bool isCJValid = offj >= 0 && offj < sDims[1];
-                    s_val = (isCJValid && isCIValid ? iptr[offj * sDims[0] + ci]
-                                                    : scalar<InT>(0));
+                    s_val =
+                        (isCJValid && isCIValid ? iptr[offj * sStrides.dims[1] +
+                                                       ci * sStrides.dims[0]]
+                                                : scalar<InT>(0));
                 }
 
                 accum += AccT(s_val * f_val);

diff --git a/src/backend/cpu/kernel/fast.hpp b/src/backend/cpu/kernel/fast.hpp
@@ -28,7 +28,9 @@ inline int idx_x(int i) {
     return idx_y(i - 12);
 }
 
-inline int idx(int y, int x, unsigned idim0) { return x * idim0 + y; }
+inline int idx(int y, int x, unsigned istrides0, unsigned istrides1) {
+    return x * istrides1 + y * istrides0;  // element offset of (y, x)
+}
 
 // test_greater()
 // Tests if a pixel x > p + thr
@@ -44,9 +46,10 @@ inline int test_smaller(float x, float p, float thr) { return (x < p - thr); }
 // Returns  1 when x > p + thr
 template<typename T>
 inline int test_pixel(const T *image, const float p, float thr, int y, int x,
-                      unsigned idim0) {
-    return -test_smaller((float)image[idx(y, x, idim0)], p, thr) +
-           test_greater((float)image[idx(y, x, idim0)], p, thr);
+                      unsigned istrides0, unsigned istrides1) {
+    return -test_smaller((float)image[idx(y, x, istrides0, istrides1)], p,
+                         thr) +  // contributes -1 if image(y, x) < p - thr
+           test_greater((float)image[idx(y, x, istrides0, istrides1)], p, thr);
 }
 
 // abs_diff()
@@ -64,44 +67,61 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
                      unsigned *count, float const thr,
                      unsigned const arc_length, unsigned const nonmax,
                      unsigned const max_feat, unsigned const edge) {
-    af::dim4 in_dims = in.dims();
-    T const *in_ptr  = in.get();
+    af::dim4 in_dims    = in.dims();
+    af::dim4 in_strides = in.strides();
+    T const *in_ptr     = in.get();
 
     for (int y = edge; y < (int)(in_dims[0] - edge); y++) {
         for (int x = edge; x < (int)(in_dims[1] - edge); x++) {
-            float p = in_ptr[idx(y, x, in_dims[0])];
+            float p = in_ptr[idx(y, x, in_strides[0], in_strides[1])];
 
             // Start by testing opposite pixels of the circle that will result
             // in a non-kepoint
             int d;
-            d = test_pixel<T>(in_ptr, p, thr, y - 3, x, in_dims[0]) |
-                test_pixel<T>(in_ptr, p, thr, y + 3, x, in_dims[0]);
+            d = test_pixel<T>(in_ptr, p, thr, y - 3, x, in_strides[0],
+                              in_strides[1]) |
+                test_pixel<T>(in_ptr, p, thr, y + 3, x, in_strides[0],
+                              in_strides[1]);
             if (d == 0) continue;
 
-            d &= test_pixel<T>(in_ptr, p, thr, y - 2, x + 2, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y + 2, x - 2, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y, x + 3, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y, x - 3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y + 2, x + 2, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y - 2, x - 2, in_dims[0]);
+            d &= test_pixel<T>(in_ptr, p, thr, y - 2, x + 2, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y + 2, x - 2, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y, x + 3, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y, x - 3, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y + 2, x + 2, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y - 2, x - 2, in_strides[0],
+                               in_strides[1]);
             if (d == 0) continue;
 
-            d &= test_pixel<T>(in_ptr, p, thr, y - 3, x + 1, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y + 3, x - 1, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y - 1, x + 3, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y + 1, x - 3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y + 1, x + 3, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y - 1, x - 3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y + 3, x + 1, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y - 3, x - 1, in_dims[0]);
+            d &= test_pixel<T>(in_ptr, p, thr, y - 3, x + 1, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y + 3, x - 1, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y - 1, x + 3, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y + 1, x - 3, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y + 1, x + 3, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y - 1, x - 3, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y + 3, x + 1, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y - 3, x - 1, in_strides[0],
+                               in_strides[1]);
             if (d == 0) continue;
 
             int sum = 0;
 
             // Sum responses [-1, 0 or 1] of first arc_length pixels
             for (int i = 0; i < static_cast<int>(arc_length); i++)
                 sum += test_pixel<T>(in_ptr, p, thr, y + idx_y(i), x + idx_x(i),
-                                     in_dims[0]);
+                                     in_strides[0], in_strides[1]);
 
             // Test maximum and mininmum responses of first segment of
             // arc_length pixels
@@ -113,29 +133,31 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
             // circle
             for (int i = arc_length; i < 16; i++) {
                 sum -= test_pixel<T>(in_ptr, p, thr, y + idx_y(i - arc_length),
-                                     x + idx_x(i - arc_length), in_dims[0]);
+                                     x + idx_x(i - arc_length), in_strides[0],
+                                     in_strides[1]);
                 sum += test_pixel<T>(in_ptr, p, thr, y + idx_y(i), x + idx_x(i),
-                                     in_dims[0]);
+                                     in_strides[0], in_strides[1]);
                 max_sum = std::max(max_sum, sum);
                 min_sum = std::min(min_sum, sum);
             }
 
             // To completely test all possible segments, it's necessary to test
             // segments that include the top junction of the circle
             for (int i = 0; i < static_cast<int>(arc_length - 1); i++) {
-                sum -= test_pixel<T>(
-                    in_ptr, p, thr, y + idx_y(16 - arc_length + i),
-                    x + idx_x(16 - arc_length + i), in_dims[0]);
+                sum -= test_pixel<T>(in_ptr, p, thr,
+                                     y + idx_y(16 - arc_length + i),
+                                     x + idx_x(16 - arc_length + i),
+                                     in_strides[0], in_strides[1]);
                 sum += test_pixel<T>(in_ptr, p, thr, y + idx_y(i), x + idx_x(i),
-                                     in_dims[0]);
+                                     in_strides[0], in_strides[1]);
                 max_sum = std::max(max_sum, sum);
                 min_sum = std::min(min_sum, sum);
             }
 
             float s_bright = 0, s_dark = 0;
             for (int i = 0; i < 16; i++) {
-                float p_x =
-                    (float)in_ptr[idx(y + idx_y(i), x + idx_x(i), in_dims[0])];
+                float p_x = (float)in_ptr[idx(y + idx_y(i), x + idx_x(i),
+                                              in_strides[0], in_strides[1])];
 
                 s_bright +=
                     test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr);
@@ -159,7 +181,7 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
                         static_cast<float>(std::max(s_bright, s_dark));
                     if (nonmax == 1) {
                         float *score_ptr = score.get();
-                        score_ptr[idx(y, x, in_dims[0])] =
+                        score_ptr[idx(y, x, 1, in_dims[0])] =
                             std::max(s_bright, s_dark);
                     }
                 }

diff --git a/src/backend/cpu/kernel/orb.hpp b/src/backend/cpu/kernel/orb.hpp
@@ -119,8 +119,9 @@ void harris_response(float* x_out, float* y_out, float* score_out,
                      unsigned* usable_feat, CParam<T> image,
                      const unsigned block_size, const float k_thr,
                      const unsigned patch_size) {
-    const af::dim4 idims = image.dims();
-    const T* image_ptr   = image.get();
+    const af::dim4 idims    = image.dims();
+    const af::dim4 istrides = image.strides();
+    const T* image_ptr      = image.get();
     for (unsigned f = 0; f < total_feat; f++) {
         unsigned x, y;
         float scl = 1.f;
@@ -154,10 +155,12 @@ void harris_response(float* x_out, float* y_out, float* score_out,
             int j = k % block_size - r;
 
             // Calculate local x and y derivatives
-            float ix = image_ptr[(x + i + 1) * idims[0] + y + j] -
-                       image_ptr[(x + i - 1) * idims[0] + y + j];
-            float iy = image_ptr[(x + i) * idims[0] + y + j + 1] -
-                       image_ptr[(x + i) * idims[0] + y + j - 1];
+            float ix =
+                image_ptr[(x + i + 1) * istrides[1] + (y + j) * istrides[0]] -
+                image_ptr[(x + i - 1) * istrides[1] + (y + j) * istrides[0]];
+            float iy =
+                image_ptr[(x + i) * istrides[1] + (y + j + 1) * istrides[0]] -
+                image_ptr[(x + i) * istrides[1] + (y + j - 1) * istrides[0]];
 
             // Accumulate second order derivatives
             ixx += ix * ix;
@@ -189,8 +192,9 @@ template<typename T>
 void centroid_angle(const float* x_in, const float* y_in,
                     float* orientation_out, const unsigned total_feat,
                     CParam<T> image, const unsigned patch_size) {
-    const af::dim4 idims = image.dims();
-    const T* image_ptr   = image.get();
+    const af::dim4 idims    = image.dims();
+    const af::dim4 istrides = image.strides();
+    const T* image_ptr      = image.get();
     for (unsigned f = 0; f < total_feat; f++) {
         unsigned x = (unsigned)round(x_in[f]);
         unsigned y = (unsigned)round(y_in[f]);
@@ -205,7 +209,7 @@ void centroid_angle(const float* x_in, const float* y_in,
             int j = k % patch_size - r;
 
             // Calculate first order moments
-            T p = image_ptr[(x + i) * idims[0] + y + j];
+            T p = image_ptr[(x + i) * istrides[1] + (y + j) * istrides[0]];
             m01 += j * p;
             m10 += i * p;
         }
@@ -219,17 +223,17 @@ template<typename T>
 inline T get_pixel(unsigned x, unsigned y, const float ori, const unsigned size,
                    const int dist_x, const int dist_y, CParam<T> image,
                    const unsigned patch_size) {
-    const af::dim4 idims = image.dims();
-    const T* image_ptr   = image.get();
-    float ori_sin        = sin(ori);
-    float ori_cos        = cos(ori);
-    float patch_scl      = (float)size / (float)patch_size;
+    const af::dim4 istrides = image.strides();
+    const T* image_ptr      = image.get();
+    float ori_sin           = sin(ori);
+    float ori_cos           = cos(ori);
+    float patch_scl         = (float)size / (float)patch_size;
 
     // Calculate point coordinates based on orientation and size
     x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
     y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
 
-    return image_ptr[x * idims[0] + y];
+    return image_ptr[x * istrides[1] + y * istrides[0]];
 }
 
 template<typename T>

diff --git a/src/backend/cuda/Kernel.cpp b/src/backend/cuda/Kernel.cpp
@@ -26,6 +26,67 @@ void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src,
     CU_CHECK(cuMemcpyDtoDAsync(dst, src, bytes, getActiveStream()));
 }
 
+void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src,
+                            size_t srcXInBytes, size_t bytes) {
+    // Honor the source offset: the copy reads `bytes` bytes beginning
+    // srcXInBytes bytes past src. The device pointer is an integral handle,
+    // so the addition is plain byte arithmetic.
+    CU_CHECK(cuMemcpyDtoDAsync(dst, src + srcXInBytes, bytes,
+                               getActiveStream()));
+}
+
+void Kernel::copyToReadOnly2D(Kernel::DevPtrType dst, Kernel::DevPtrType src,
+                              size_t srcXInBytes, size_t srcPitchInBytes,
+                              size_t height, size_t widthInBytes) {
+    // Value-initialize the descriptor so that the members not assigned below
+    // (srcHost, srcArray, dstHost, dstArray) are handed to the driver zeroed
+    // instead of holding indeterminate stack contents.
+    CUDA_MEMCPY2D pCopy = {};
+
+    // Source: strided device memory. The driver starts reading at
+    //   srcDevice + srcY * srcPitch + srcXInBytes
+    pCopy.srcXInBytes   = srcXInBytes;
+    pCopy.srcY          = 0;
+    pCopy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    pCopy.srcDevice     = src;
+    pCopy.srcPitch      = srcPitchInBytes;
+
+    // Destination: device memory whose pitch equals the copied width, i.e.
+    // rows are written back to back. The driver starts writing at
+    //   dstDevice + dstY * dstPitch + dstXInBytes
+    pCopy.dstXInBytes   = 0;
+    pCopy.dstY          = 0;
+    pCopy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    pCopy.dstDevice     = dst;
+    pCopy.dstPitch      = widthInBytes;
+
+    // Extent of the region to copy.
+    pCopy.WidthInBytes = widthInBytes;
+    pCopy.Height       = height;
+
+    CU_CHECK(cuMemcpy2DAsync(&pCopy, getActiveStream()));
+}
+
+void Kernel::copyToReadOnly3D(Kernel::DevPtrType dst, Kernel::DevPtrType src,
+                              size_t srcXInBytes, size_t srcPitchInBytes,
+                              size_t srcHeight, size_t depth, size_t height,
+                              size_t widthInBytes) {
+    // Value-initialize the descriptor: members that are not assigned below
+    // (reserved0, reserved1 and the host/array handles) would otherwise be
+    // indeterminate, and the CUDA driver API documents the reserved members
+    // as "must be NULL" and the LOD members as "must be 0".
+    CUDA_MEMCPY3D pCopy = {};
+
+    // Source: strided device memory. The driver starts reading at
+    //   srcDevice + (srcZ * srcHeight + srcY) * srcPitch + srcXInBytes
+    pCopy.srcXInBytes   = srcXInBytes;
+    pCopy.srcY          = 0;
+    pCopy.srcZ          = 0;
+    pCopy.srcLOD        = 0;
+    pCopy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    pCopy.srcDevice     = src;
+    pCopy.srcPitch      = srcPitchInBytes;
+    pCopy.srcHeight     = srcHeight;
+
+    // Destination: device memory whose pitch equals the copied width and
+    // whose height equals the copied height, i.e. rows and slices are
+    // written back to back. The driver starts writing at
+    //   dstDevice + (dstZ * dstHeight + dstY) * dstPitch + dstXInBytes
+    pCopy.dstXInBytes   = 0;
+    pCopy.dstY          = 0;
+    pCopy.dstZ          = 0;
+    pCopy.dstLOD        = 0;
+    pCopy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    pCopy.dstDevice     = dst;
+    pCopy.dstPitch      = widthInBytes;
+    pCopy.dstHeight     = height;
+
+    // Extent of the region to copy.
+    pCopy.WidthInBytes = widthInBytes;
+    pCopy.Height       = height;
+    pCopy.Depth        = depth;
+
+    CU_CHECK(cuMemcpy3DAsync(&pCopy, getActiveStream()));
+}
+
 void Kernel::setFlag(Kernel::DevPtrType dst, int* scalarValPtr,
                      const bool syncCopy) {
     CU_CHECK(