diff --git a/src/backend/common/KernelInterface.hpp b/src/backend/common/KernelInterface.hpp
index 0ead60a8cd..252f04f37f 100644
--- a/src/backend/common/KernelInterface.hpp
+++ b/src/backend/common/KernelInterface.hpp
@@ -64,6 +64,51 @@ class KernelInterface {
     virtual void copyToReadOnly(DevPtrType dst, DevPtrType src,
                                 size_t bytes) = 0;
 
+    /// \brief Copy data from device memory to read-only memory
+    ///
+    /// This function copies data of `bytes` size from the device pointer to a
+    /// read-only memory.
+    ///
+    /// \param[in] dst is the device pointer to which data will be copied
+    /// \param[in] src is the device pointer from which data will be copied
+    /// \param[in] srcXInBytes is offset in Bytes
+    /// \param[in] bytes are the number of bytes of data to be copied
+    virtual void copyToReadOnly(DevPtrType dst, DevPtrType src,
+                                size_t srcXInBytes, size_t bytes) = 0;
+
+    /// \brief Copy strided 2D data from device memory to read-only memory
+    ///
+    /// This function copies data of any 2D array from the device pointer to a
+    /// read-only memory.
+    ///
+    /// \param[in] dst is the device pointer to which data will be copied
+    /// \param[in] src is the device pointer from which data will be copied
+    /// \param[in] srcXInBytes is offset in Bytes
+    /// \param[in] srcPitchInBytes is strides[1] in Bytes
+    /// \param[in] height is the number of elements for dim[1] dst
+    /// \param[in] widthInBytes are #bytes of continous data to copy (dim[0])
+    virtual void copyToReadOnly2D(DevPtrType dst, DevPtrType src,
+                                  size_t srcXInBytes, size_t srcPitchInBytes,
+                                  size_t height, size_t widthInBytes) = 0;
+
+    /// \brief Copy strided 3D data from device memory to read-only memory
+    ///
+    /// This function copies data of any 3D array from the device pointer to a
+    /// read-only memory.
+    ///
+    /// \param[in] dst is the device pointer to which data will be copied
+    /// \param[in] src is the device pointer from which data will be copied
+    /// \param[in] srcXInBytes is offset in Bytes
+    /// \param[in] srcPitchInBytes is strides[1] in Bytes
+    /// \param[in] srcHeight is the number of elements ALLOCATED for dim[1] src
+    /// \param[in] depth is the number of elements for dim[2] dst
+    /// \param[in] height is the number of elements for dim[1] dst
+    /// \param[in] widthInBytes are #bytes of continous data to copy (dim[0])
+    virtual void copyToReadOnly3D(DevPtrType dst, DevPtrType src,
+                                  size_t srcXInBytes, size_t srcPitchInBytes,
+                                  size_t srcHeight, size_t depth, size_t height,
+                                  size_t widthInBytes) = 0;
+
     /// \brief Copy a single scalar to device memory
     ///
     /// This function copies a single value of type T from host variable
diff --git a/src/backend/cpu/kernel/convolve.hpp b/src/backend/cpu/kernel/convolve.hpp
index 62381dd749..5bb5478931 100644
--- a/src/backend/cpu/kernel/convolve.hpp
+++ b/src/backend/cpu/kernel/convolve.hpp
@@ -125,8 +125,8 @@ void one2one_3d(InT *optr, InT const *const iptr, AccT const *const fptr,
                 }
                 optr[koff + joff + i - iStart] = InT(accum);
             }  // i loop ends here
-        }      // j loop ends here
-    }          // k loop ends here
+        }  // j loop ends here
+    }  // k loop ends here
 }
 
 template<typename InT, typename AccT>
@@ -217,7 +217,6 @@ void convolve2_separable(InT *optr, InT const *const iptr,
                          dim_t fDim, af::dim4 const &oStrides,
                          af::dim4 const &sStrides, dim_t fStride) {
     UNUSED(orgDims);
-    UNUSED(sStrides);
     UNUSED(fStride);
     for (dim_t j = 0; j < oDims[1]; ++j) {
         dim_t jOff = j * oStrides[1];
@@ -237,14 +236,18 @@ void convolve2_separable(InT *optr, InT const *const iptr,
                     dim_t offi     = ci - f;
                     bool isCIValid = offi >= 0 && offi < sDims[0];
                     bool isCJValid = cj >= 0 && cj < sDims[1];
-                    s_val = (isCJValid && isCIValid ? iptr[cj * sDims[0] + offi]
-                                                    : scalar<InT>(0));
+                    s_val =
+                        (isCJValid && isCIValid ? iptr[cj * sStrides.dims[1] +
+                                                       offi * sStrides.dims[0]]
+                                                : scalar<InT>(0));
                 } else {
                     dim_t offj     = cj - f;
                     bool isCIValid = ci >= 0 && ci < sDims[0];
                     bool isCJValid = offj >= 0 && offj < sDims[1];
-                    s_val = (isCJValid && isCIValid ? iptr[offj * sDims[0] + ci]
-                                                    : scalar<InT>(0));
+                    s_val =
+                        (isCJValid && isCIValid ? iptr[offj * sStrides.dims[1] +
+                                                       ci * sStrides.dims[0]]
+                                                : scalar<InT>(0));
                 }
 
                 accum += AccT(s_val * f_val);
diff --git a/src/backend/cpu/kernel/fast.hpp b/src/backend/cpu/kernel/fast.hpp
index 341ddbe701..7927dba060 100644
--- a/src/backend/cpu/kernel/fast.hpp
+++ b/src/backend/cpu/kernel/fast.hpp
@@ -28,7 +28,9 @@ inline int idx_x(int i) {
     return idx_y(i - 12);
 }
 
-inline int idx(int y, int x, unsigned idim0) { return x * idim0 + y; }
+inline int idx(int y, int x, unsigned istrides0, unsigned istrides1) {
+    return x * istrides1 + y * istrides0;
+}
 
 // test_greater()
 // Tests if a pixel x > p + thr
@@ -44,9 +46,10 @@ inline int test_smaller(float x, float p, float thr) { return (x < p - thr); }
 // Returns  1 when x > p + thr
 template<typename T>
 inline int test_pixel(const T *image, const float p, float thr, int y, int x,
-                      unsigned idim0) {
-    return -test_smaller((float)image[idx(y, x, idim0)], p, thr) +
-           test_greater((float)image[idx(y, x, idim0)], p, thr);
+                      unsigned istrides0, unsigned istrides1) {
+    return -test_smaller((float)image[idx(y, x, istrides0, istrides1)], p,
+                         thr) +
+           test_greater((float)image[idx(y, x, istrides0, istrides1)], p, thr);
 }
 
 // abs_diff()
@@ -64,36 +67,53 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
                      unsigned *count, float const thr,
                      unsigned const arc_length, unsigned const nonmax,
                      unsigned const max_feat, unsigned const edge) {
-    af::dim4 in_dims = in.dims();
-    T const *in_ptr  = in.get();
+    af::dim4 in_dims    = in.dims();
+    af::dim4 in_strides = in.strides();
+    T const *in_ptr     = in.get();
 
     for (int y = edge; y < (int)(in_dims[0] - edge); y++) {
         for (int x = edge; x < (int)(in_dims[1] - edge); x++) {
-            float p = in_ptr[idx(y, x, in_dims[0])];
+            float p = in_ptr[idx(y, x, in_strides[0], in_strides[1])];
 
             // Start by testing opposite pixels of the circle that will result
             // in a non-kepoint
             int d;
-            d = test_pixel<T>(in_ptr, p, thr, y - 3, x, in_dims[0]) |
-                test_pixel<T>(in_ptr, p, thr, y + 3, x, in_dims[0]);
+            d = test_pixel<T>(in_ptr, p, thr, y - 3, x, in_strides[0],
+                              in_strides[1]) |
+                test_pixel<T>(in_ptr, p, thr, y + 3, x, in_strides[0],
+                              in_strides[1]);
             if (d == 0) continue;
 
-            d &= test_pixel<T>(in_ptr, p, thr, y - 2, x + 2, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y + 2, x - 2, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y, x + 3, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y, x - 3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y + 2, x + 2, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y - 2, x - 2, in_dims[0]);
+            d &= test_pixel<T>(in_ptr, p, thr, y - 2, x + 2, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y + 2, x - 2, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y, x + 3, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y, x - 3, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y + 2, x + 2, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y - 2, x - 2, in_strides[0],
+                               in_strides[1]);
             if (d == 0) continue;
 
-            d &= test_pixel<T>(in_ptr, p, thr, y - 3, x + 1, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y + 3, x - 1, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y - 1, x + 3, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y + 1, x - 3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y + 1, x + 3, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y - 1, x - 3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y + 3, x + 1, in_dims[0]) |
-                 test_pixel<T>(in_ptr, p, thr, y - 3, x - 1, in_dims[0]);
+            d &= test_pixel<T>(in_ptr, p, thr, y - 3, x + 1, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y + 3, x - 1, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y - 1, x + 3, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y + 1, x - 3, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y + 1, x + 3, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y - 1, x - 3, in_strides[0],
+                               in_strides[1]);
+            d &= test_pixel<T>(in_ptr, p, thr, y + 3, x + 1, in_strides[0],
+                               in_strides[1]) |
+                 test_pixel<T>(in_ptr, p, thr, y - 3, x - 1, in_strides[0],
+                               in_strides[1]);
             if (d == 0) continue;
 
             int sum = 0;
@@ -101,7 +121,7 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
             // Sum responses [-1, 0 or 1] of first arc_length pixels
             for (int i = 0; i < static_cast<int>(arc_length); i++)
                 sum += test_pixel<T>(in_ptr, p, thr, y + idx_y(i), x + idx_x(i),
-                                     in_dims[0]);
+                                     in_strides[0], in_strides[1]);
 
             // Test maximum and mininmum responses of first segment of
             // arc_length pixels
@@ -113,9 +133,10 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
             // circle
             for (int i = arc_length; i < 16; i++) {
                 sum -= test_pixel<T>(in_ptr, p, thr, y + idx_y(i - arc_length),
-                                     x + idx_x(i - arc_length), in_dims[0]);
+                                     x + idx_x(i - arc_length), in_strides[0],
+                                     in_strides[1]);
                 sum += test_pixel<T>(in_ptr, p, thr, y + idx_y(i), x + idx_x(i),
-                                     in_dims[0]);
+                                     in_strides[0], in_strides[1]);
                 max_sum = std::max(max_sum, sum);
                 min_sum = std::min(min_sum, sum);
             }
@@ -123,19 +144,20 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
             // To completely test all possible segments, it's necessary to test
             // segments that include the top junction of the circle
             for (int i = 0; i < static_cast<int>(arc_length - 1); i++) {
-                sum -= test_pixel<T>(
-                    in_ptr, p, thr, y + idx_y(16 - arc_length + i),
-                    x + idx_x(16 - arc_length + i), in_dims[0]);
+                sum -= test_pixel<T>(in_ptr, p, thr,
+                                     y + idx_y(16 - arc_length + i),
+                                     x + idx_x(16 - arc_length + i),
+                                     in_strides[0], in_strides[1]);
                 sum += test_pixel<T>(in_ptr, p, thr, y + idx_y(i), x + idx_x(i),
-                                     in_dims[0]);
+                                     in_strides[0], in_strides[1]);
                 max_sum = std::max(max_sum, sum);
                 min_sum = std::min(min_sum, sum);
             }
 
             float s_bright = 0, s_dark = 0;
             for (int i = 0; i < 16; i++) {
-                float p_x =
-                    (float)in_ptr[idx(y + idx_y(i), x + idx_x(i), in_dims[0])];
+                float p_x = (float)in_ptr[idx(y + idx_y(i), x + idx_x(i),
+                                              in_strides[0], in_strides[1])];
 
                 s_bright +=
                     test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr);
@@ -159,7 +181,7 @@ void locate_features(CParam<T> in, Param<float> score, Param<float> x_out,
                         static_cast<float>(std::max(s_bright, s_dark));
                     if (nonmax == 1) {
                         float *score_ptr = score.get();
-                        score_ptr[idx(y, x, in_dims[0])] =
+                        score_ptr[idx(y, x, 1, in_dims[0])] =
                             std::max(s_bright, s_dark);
                     }
                 }
diff --git a/src/backend/cpu/kernel/orb.hpp b/src/backend/cpu/kernel/orb.hpp
index 385f71abb6..708caba7f3 100644
--- a/src/backend/cpu/kernel/orb.hpp
+++ b/src/backend/cpu/kernel/orb.hpp
@@ -119,8 +119,9 @@ void harris_response(float* x_out, float* y_out, float* score_out,
                      unsigned* usable_feat, CParam<T> image,
                      const unsigned block_size, const float k_thr,
                      const unsigned patch_size) {
-    const af::dim4 idims = image.dims();
-    const T* image_ptr   = image.get();
+    const af::dim4 idims    = image.dims();
+    const af::dim4 istrides = image.strides();
+    const T* image_ptr      = image.get();
     for (unsigned f = 0; f < total_feat; f++) {
         unsigned x, y;
         float scl = 1.f;
@@ -154,10 +155,12 @@ void harris_response(float* x_out, float* y_out, float* score_out,
             int j = k % block_size - r;
 
             // Calculate local x and y derivatives
-            float ix = image_ptr[(x + i + 1) * idims[0] + y + j] -
-                       image_ptr[(x + i - 1) * idims[0] + y + j];
-            float iy = image_ptr[(x + i) * idims[0] + y + j + 1] -
-                       image_ptr[(x + i) * idims[0] + y + j - 1];
+            float ix =
+                image_ptr[(x + i + 1) * istrides[1] + (y + j) * istrides[0]] -
+                image_ptr[(x + i - 1) * istrides[1] + (y + j) * istrides[0]];
+            float iy =
+                image_ptr[(x + i) * istrides[1] + (y + j + 1) * istrides[0]] -
+                image_ptr[(x + i) * istrides[1] + (y + j - 1) * istrides[0]];
 
             // Accumulate second order derivatives
             ixx += ix * ix;
@@ -189,8 +192,9 @@ template<typename T>
 void centroid_angle(const float* x_in, const float* y_in,
                     float* orientation_out, const unsigned total_feat,
                     CParam<T> image, const unsigned patch_size) {
-    const af::dim4 idims = image.dims();
-    const T* image_ptr   = image.get();
+    const af::dim4 idims    = image.dims();
+    const af::dim4 istrides = image.strides();
+    const T* image_ptr      = image.get();
     for (unsigned f = 0; f < total_feat; f++) {
         unsigned x = (unsigned)round(x_in[f]);
         unsigned y = (unsigned)round(y_in[f]);
@@ -205,7 +209,7 @@ void centroid_angle(const float* x_in, const float* y_in,
             int j = k % patch_size - r;
 
             // Calculate first order moments
-            T p = image_ptr[(x + i) * idims[0] + y + j];
+            T p = image_ptr[(x + i) * istrides[1] + (y + j) * istrides[0]];
             m01 += j * p;
             m10 += i * p;
         }
@@ -219,17 +223,17 @@ template<typename T>
 inline T get_pixel(unsigned x, unsigned y, const float ori, const unsigned size,
                    const int dist_x, const int dist_y, CParam<T> image,
                    const unsigned patch_size) {
-    const af::dim4 idims = image.dims();
-    const T* image_ptr   = image.get();
-    float ori_sin        = sin(ori);
-    float ori_cos        = cos(ori);
-    float patch_scl      = (float)size / (float)patch_size;
+    const af::dim4 istrides = image.strides();
+    const T* image_ptr      = image.get();
+    float ori_sin           = sin(ori);
+    float ori_cos           = cos(ori);
+    float patch_scl         = (float)size / (float)patch_size;
 
     // Calculate point coordinates based on orientation and size
     x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
     y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
 
-    return image_ptr[x * idims[0] + y];
+    return image_ptr[x * istrides[1] + y * istrides[0]];
 }
 
 template<typename T>
diff --git a/src/backend/cuda/Kernel.cpp b/src/backend/cuda/Kernel.cpp
index d72672a1fc..25968fb646 100644
--- a/src/backend/cuda/Kernel.cpp
+++ b/src/backend/cuda/Kernel.cpp
@@ -26,6 +26,67 @@ void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src,
     CU_CHECK(cuMemcpyDtoDAsync(dst, src, bytes, getActiveStream()));
 }
 
+void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src,
+                            size_t srcXInBytes, size_t bytes) {
+    CU_CHECK(cuMemcpyDtoDAsync(dst, src, bytes, getActiveStream()));
+}
+
+void Kernel::copyToReadOnly2D(Kernel::DevPtrType dst, Kernel::DevPtrType src,
+                              size_t srcXInBytes, size_t srcPitchInBytes,
+                              size_t height, size_t widthInBytes) {
+    CUDA_MEMCPY2D pCopy;
+    pCopy.srcXInBytes   = srcXInBytes;
+    pCopy.srcY          = 0;
+    pCopy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    pCopy.srcDevice     = src;
+    pCopy.srcPitch      = srcPitchInBytes;
+
+    pCopy.dstXInBytes   = 0;
+    pCopy.dstY          = 0;
+    pCopy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    pCopy.dstDevice     = dst;
+    pCopy.dstPitch      = widthInBytes;
+
+    pCopy.WidthInBytes = widthInBytes;
+    pCopy.Height       = height;
+    // CUdeviceptr srcStart = srcDevice + srcY*srcPitch + srcXInBytes;
+    // CUdeviceptr dstStart = dstDevice + dstY*dstPitch + dstXInBytes;
+
+    CU_CHECK(cuMemcpy2DAsync(&pCopy, getActiveStream()));
+}
+
+void Kernel::copyToReadOnly3D(Kernel::DevPtrType dst, Kernel::DevPtrType src,
+                              size_t srcXInBytes, size_t srcPitchInBytes,
+                              size_t srcHeight, size_t depth, size_t height,
+                              size_t widthInBytes) {
+    CUDA_MEMCPY3D pCopy;
+    pCopy.srcXInBytes   = srcXInBytes;
+    pCopy.srcY          = 0;
+    pCopy.srcZ          = 0;
+    pCopy.srcLOD        = 0;
+    pCopy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    pCopy.srcDevice     = src;
+    pCopy.srcPitch      = srcPitchInBytes;
+    pCopy.srcHeight     = srcHeight;
+
+    pCopy.dstXInBytes   = 0;
+    pCopy.dstY          = 0;
+    pCopy.dstZ          = 0;
+    pCopy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    pCopy.dstDevice     = dst;
+    pCopy.dstPitch      = widthInBytes;
+    pCopy.dstHeight     = height;
+
+    pCopy.WidthInBytes = widthInBytes;
+    pCopy.Height       = height;
+    pCopy.Depth        = depth;
+    // CUdeviceptr srcStart =
+    //      srcDevice + (srcZ*srcHeight+srcY)*srcPitch + srcXInBytes;
+    // CUdeviceptr dstStart =
+    //      dstDevice + (dstZ*dstHeight+dstY)*dstPitch + dstXInBytes;
+    CU_CHECK(cuMemcpy3DAsync(&pCopy, getActiveStream()));
+}
+
 void Kernel::setFlag(Kernel::DevPtrType dst, int* scalarValPtr,
                      const bool syncCopy) {
     CU_CHECK(
diff --git a/src/backend/cuda/Kernel.hpp b/src/backend/cuda/Kernel.hpp
index 2199292080..2d5ff6eb39 100644
--- a/src/backend/cuda/Kernel.hpp
+++ b/src/backend/cuda/Kernel.hpp
@@ -66,6 +66,18 @@ class Kernel
 
     void copyToReadOnly(DevPtrType dst, DevPtrType src, size_t bytes) final;
 
+    void copyToReadOnly(DevPtrType dst, DevPtrType src, size_t srcXInBytes,
+                        size_t bytes) final;
+
+    void copyToReadOnly2D(DevPtrType dst, DevPtrType src, size_t srcXInBytes,
+                          size_t srcPitchInBytes, size_t height,
+                          size_t widthInBytes) final;
+
+    void copyToReadOnly3D(DevPtrType dst, DevPtrType src, size_t srcXInBytes,
+                          size_t srcPitchInBytes, size_t srcHeight,
+                          size_t depth, size_t height,
+                          size_t widthInBytes) final;
+
     void setFlag(DevPtrType dst, int* scalarValPtr,
                  const bool syncCopy = false) final;
 
diff --git a/src/backend/cuda/kernel/convolve.hpp b/src/backend/cuda/kernel/convolve.hpp
index 38339f2de2..0af7ade897 100644
--- a/src/backend/cuda/kernel/convolve.hpp
+++ b/src/backend/cuda/kernel/convolve.hpp
@@ -120,7 +120,6 @@ void convolve_1d(conv_kparam_t& p, Param<T> out, CParam<T> sig, CParam<aT> filt,
                 int f1Off      = b1 * filt.strides[1];
                 const aT* fptr = filt.ptr + (f1Off + f2Off + f3Off);
 
-                // FIXME: case where filter array is strided
                 auto constMemPtr = convolve1.getDevPtr(conv_c_name);
                 convolve1.copyToReadOnly(constMemPtr,
                                          reinterpret_cast<CUdeviceptr>(fptr),
@@ -145,28 +144,40 @@ void convolve_1d(conv_kparam_t& p, Param<T> out, CParam<T> sig, CParam<aT> filt,
 
 template<typename T, typename aT>
 void conv2Helper(const conv_kparam_t& p, Param<T> out, CParam<T> sig,
-                 const aT* fptr, int f0, int f1, const bool expand) {
-    const bool isFilterSizeLt5  = (f0 <= 5 && f1 <= 5);
-    const bool isFilterGt5AndSq = (f0 == f1 && f0 > 5 && f0 < 18);
+                 const aT* fptr, const dim_t* const fdims,
+                 const dim_t* const fstrides, const bool expand) {
+    const bool isFilterSizeLt5 = (fdims[0] <= 5 && fdims[1] <= 5);
+    const bool isFilterGt5AndSq =
+        (fdims[0] == fdims[1] && fdims[0] > 5 && fdims[0] < 18);
 
     if (!(isFilterSizeLt5 || isFilterGt5AndSq)) {
         char errMessage[256];
         snprintf(errMessage, sizeof(errMessage),
-                 "\nCUDA Convolution doesn't support %dx%d kernel\n", f0, f1);
+                 "\nCUDA Convolution doesn't support %lldx%lld kernel\n",
+                 fdims[0], fdims[1]);
         CUDA_NOT_SUPPORTED(errMessage);
     }
 
     auto convolve2 = common::getKernel(
         "arrayfire::cuda::convolve2", {{convolve2_cuh_src}},
         TemplateArgs(TemplateTypename<T>(), TemplateTypename<aT>(),
-                     TemplateArg(expand), TemplateArg(f0), TemplateArg(f1)),
+                     TemplateArg(expand), TemplateArg(fdims[0]),
+                     TemplateArg(fdims[1])),
         {{DefineValue(MAX_CONV1_FILTER_LEN), DefineValue(CONV_THREADS),
           DefineValue(CONV2_THREADS_X), DefineValue(CONV2_THREADS_Y)}});
 
-    // FIXME: case where filter array is strided
     auto constMemPtr = convolve2.getDevPtr(conv_c_name);
-    convolve2.copyToReadOnly(constMemPtr, reinterpret_cast<CUdeviceptr>(fptr),
-                             f0 * f1 * sizeof(aT));
+    if (fstrides[1] == fdims[0]) {
+        // linear filter array
+        convolve2.copyToReadOnly(constMemPtr,
+                                 reinterpret_cast<CUdeviceptr>(fptr),
+                                 fdims[0] * fdims[1] * sizeof(aT));
+    } else {
+        // strided filter array
+        convolve2.copyToReadOnly2D(
+            constMemPtr, reinterpret_cast<CUdeviceptr>(fptr), 0,
+            fstrides[1] * sizeof(aT), fdims[1], fdims[0] * sizeof(aT));
+    }
 
     EnqueueArgs qArgs(p.mBlocks, p.mThreads, getActiveStream());
     convolve2(qArgs, out, sig, p.mBlk_x, p.mBlk_y, p.o[1], p.o[2], p.s[1],
@@ -192,7 +203,7 @@ void convolve_2d(conv_kparam_t& p, Param<T> out, CParam<T> sig, CParam<aT> filt,
             p.s[1] = (p.inHasNoOffset ? 0 : b2);
             p.s[2] = (p.inHasNoOffset ? 0 : b3);
 
-            conv2Helper<T, aT>(p, out, sig, fptr, filt.dims[0], filt.dims[1],
+            conv2Helper<T, aT>(p, out, sig, fptr, filt.dims, filt.strides,
                                expand);
         }
     }
@@ -218,11 +229,18 @@ void convolve_3d(conv_kparam_t& p, Param<T> out, CParam<T> sig, CParam<aT> filt,
 
         const aT* fptr = filt.ptr + f3Off;
 
-        // FIXME: case where filter array is strided
         auto constMemPtr = convolve3.getDevPtr(conv_c_name);
-        convolve3.copyToReadOnly(
-            constMemPtr, reinterpret_cast<CUdeviceptr>(fptr), filterSize);
-
+        if (filt.strides[2] == filt.dims[0] * filt.dims[1]) {
+            // linear filter array
+            convolve3.copyToReadOnly(
+                constMemPtr, reinterpret_cast<CUdeviceptr>(fptr), filterSize);
+        } else {
+            // strided filter array
+            convolve3.copyToReadOnly3D(
+                constMemPtr, reinterpret_cast<CUdeviceptr>(fptr), 0,
+                filt.strides[1] * sizeof(aT), filt.strides[2] / filt.strides[1],
+                filt.dims[2], filt.dims[1], filt.dims[0] * sizeof(aT));
+        }
         p.o[2] = (p.outHasNoOffset ? 0 : b3);
         p.s[2] = (p.inHasNoOffset ? 0 : b3);
 
@@ -321,11 +339,26 @@ void convolve2(Param<T> out, CParam<T> signal, CParam<aT> filter, int conv_dim,
 
     dim3 blocks(blk_x * signal.dims[2], blk_y * signal.dims[3]);
 
-    // FIXME: case where filter array is strided
     auto constMemPtr = convolve2_separable.getDevPtr(sconv_c_name);
-    convolve2_separable.copyToReadOnly(
-        constMemPtr, reinterpret_cast<CUdeviceptr>(filter.ptr),
-        fLen * sizeof(aT));
+    if (filter.strides[1] == filter.dims[0]) {
+        // linear filter array
+        convolve2_separable.copyToReadOnly(
+            constMemPtr, reinterpret_cast<CUdeviceptr>(filter.ptr),
+            fLen * sizeof(aT));
+    } else {
+        // strided filter array 4D
+        const aT* fptr   = filter.ptr;
+        CUdeviceptr dptr = constMemPtr;
+        for (dim_t d3 = 0; d3 < filter.dims[3]; ++d3, fptr += filter.strides[3],
+                   dptr += filter.dims[0] * filter.dims[1] * filter.dims[2] *
+                                                sizeof(aT)) {
+            convolve2_separable.copyToReadOnly3D(
+                dptr, reinterpret_cast<CUdeviceptr>(fptr), 0,
+                filter.strides[1] * sizeof(aT),
+                filter.strides[2] + filter.strides[1], filter.dims[2],
+                filter.dims[1], filter.dims[0] * sizeof(aT));
+        }
+    }
 
     EnqueueArgs qArgs(blocks, threads, getActiveStream());
     convolve2_separable(qArgs, out, signal, blk_x, blk_y);
diff --git a/src/backend/cuda/kernel/fast.hpp b/src/backend/cuda/kernel/fast.hpp
index 7b54162b42..c98726d0a1 100644
--- a/src/backend/cuda/kernel/fast.hpp
+++ b/src/backend/cuda/kernel/fast.hpp
@@ -177,16 +177,17 @@ __device__ void load_shared_image(CParam<T> in, T *local_image, unsigned ix,
     // Copy an image patch to shared memory, with a 3-pixel edge
     if (ix < lx && iy < ly && x - 3 < in.dims[0] && y - 3 < in.dims[1]) {
         local_image[(ix) + (bx + 6) * (iy)] =
-            in.ptr[(x - 3) + in.dims[0] * (y - 3)];
+            in.ptr[in.strides[0] * (x - 3) + in.strides[1] * (y - 3)];
         if (x + lx - 3 < in.dims[0])
             local_image[(ix + lx) + (bx + 6) * (iy)] =
-                in.ptr[(x + lx - 3) + in.dims[0] * (y - 3)];
+                in.ptr[in.strides[0] * (x + lx - 3) + in.strides[1] * (y - 3)];
         if (y + ly - 3 < in.dims[1])
             local_image[(ix) + (bx + 6) * (iy + ly)] =
-                in.ptr[(x - 3) + in.dims[0] * (y + ly - 3)];
+                in.ptr[in.strides[0] * (x - 3) + in.strides[1] * (y + ly - 3)];
         if (x + lx - 3 < in.dims[0] && y + ly - 3 < in.dims[1])
             local_image[(ix + lx) + (bx + 6) * (iy + ly)] =
-                in.ptr[(x + lx - 3) + in.dims[0] * (y + ly - 3)];
+                in.ptr[in.strides[0] * (x + lx - 3) +
+                       in.strides[1] * (y + ly - 3)];
     }
 }
 
diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
index e956f02441..072657550d 100644
--- a/src/backend/cuda/kernel/harris.hpp
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -174,7 +174,7 @@ void harris(unsigned* corners_out, float** x_out, float** y_out,
         filter.strides[k] = filter.dims[k - 1] * filter.strides[k - 1];
     }
 
-    int filter_elem   = filter.strides[3] * filter.dims[3];
+    int filter_elem   = filter.dims[0] * filter.dims[1];
     auto filter_alloc = memAlloc<convAccT>(filter_elem);
     filter.ptr        = filter_alloc.get();
     CUDA_CHECK(cudaMemcpyAsync(filter.ptr, h_filter.data(),
@@ -184,9 +184,11 @@ void harris(unsigned* corners_out, float** x_out, float** y_out,
     const unsigned border_len = filter_len / 2 + 1;
 
     Param<T> ix, iy;
-    for (dim_t i = 0; i < 4; i++) {
+    ix.dims[0] = iy.dims[0] = in.dims[0];
+    ix.strides[0] = iy.strides[0] = 1;
+    for (dim_t i = 1; i < 4; i++) {
         ix.dims[i] = iy.dims[i] = in.dims[i];
-        ix.strides[i] = iy.strides[i] = in.strides[i];
+        ix.strides[i] = iy.strides[i] = ix.strides[i - 1] * ix.dims[i - 1];
     }
     auto ix_alloc = memAlloc<T>(ix.dims[3] * ix.strides[3]);
     auto iy_alloc = memAlloc<T>(iy.dims[3] * iy.strides[3]);
@@ -198,12 +200,16 @@ void harris(unsigned* corners_out, float** x_out, float** y_out,
 
     Param<T> ixx, ixy, iyy;
     Param<T> ixx_tmp, ixy_tmp, iyy_tmp;
-    for (dim_t i = 0; i < 4; i++) {
-        ixx.dims[i] = ixy.dims[i] = iyy.dims[i] = in.dims[i];
-        ixx_tmp.dims[i] = ixy_tmp.dims[i] = iyy_tmp.dims[i] = in.dims[i];
-        ixx.strides[i] = ixy.strides[i] = iyy.strides[i] = in.strides[i];
-        ixx_tmp.strides[i] = ixy_tmp.strides[i] = iyy_tmp.strides[i] =
-            in.strides[i];
+    ixx.dims[0] = ixy.dims[0] = iyy.dims[0] = ixx_tmp.dims[0] =
+        ixy_tmp.dims[0] = iyy_tmp.dims[0] = in.dims[0];
+    ixx.strides[0] = ixy.strides[0] = iyy.strides[0] = ixx_tmp.strides[0] =
+        ixy_tmp.strides[0] = iyy_tmp.strides[0] = 1;
+    for (dim_t i = 1; i < 4; i++) {
+        ixx.dims[i] = ixy.dims[i] = iyy.dims[i] = ixx_tmp.dims[i] =
+            ixy_tmp.dims[i] = iyy_tmp.dims[i] = in.dims[i];
+        ixx.strides[i] = ixy.strides[i] = iyy.strides[i] = ixx_tmp.strides[i] =
+            ixy_tmp.strides[i]                           = iyy_tmp.strides[i] =
+                ixx.strides[i - 1] * ixx.dims[i - 1];
     }
     auto ixx_alloc = memAlloc<T>(ixx.dims[3] * ixx.strides[3]);
     auto ixy_alloc = memAlloc<T>(ixy.dims[3] * ixy.strides[3]);
@@ -214,9 +220,9 @@ void harris(unsigned* corners_out, float** x_out, float** y_out,
 
     // Compute second-order derivatives
     dim3 threads(THREADS_PER_BLOCK, 1);
-    dim3 blocks(divup(in.dims[3] * in.strides[3], threads.x), 1);
+    dim3 blocks(divup(in.dims[0] * in.dims[1], threads.x), 1);
     CUDA_LAUNCH((second_order_deriv<T>), blocks, threads, ixx.ptr, ixy.ptr,
-                iyy.ptr, in.dims[3] * in.strides[3], ix.ptr, iy.ptr);
+                iyy.ptr, in.dims[0] * in.dims[1], ix.ptr, iy.ptr);
 
     auto ixx_tmp_alloc = memAlloc<T>(ixx_tmp.dims[3] * ixx_tmp.strides[3]);
     auto ixy_tmp_alloc = memAlloc<T>(ixy_tmp.dims[3] * ixy_tmp.strides[3]);
@@ -235,7 +241,7 @@ void harris(unsigned* corners_out, float** x_out, float** y_out,
 
     // Number of corners is not known a priori, limit maximum number of corners
     // according to image dimensions
-    unsigned corner_lim = in.dims[3] * in.strides[3] * 0.2f;
+    unsigned corner_lim = in.dims[0] * in.dims[1] * 0.2f;
 
     auto d_corners_found = memAlloc<unsigned>(1);
     CUDA_CHECK(cudaMemsetAsync(d_corners_found.get(), 0, sizeof(unsigned),
@@ -245,7 +251,7 @@ void harris(unsigned* corners_out, float** x_out, float** y_out,
     auto d_y_corners    = memAlloc<float>(corner_lim);
     auto d_resp_corners = memAlloc<float>(corner_lim);
 
-    auto d_responses = memAlloc<T>(in.dims[3] * in.strides[3]);
+    auto d_responses = memAlloc<T>(in.dims[0] * in.dims[1]);
 
     // Calculate Harris responses for all pixels
     threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index c1df7620f5..e7da4eadb9 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -125,10 +125,14 @@ __global__ void harris_response(float* score_out, float* size_out,
             int j = k % block_size - r;
 
             // Calculate local x and y derivatives
-            float ix = image.ptr[(x + i + 1) * image.dims[0] + y + j] -
-                       image.ptr[(x + i - 1) * image.dims[0] + y + j];
-            float iy = image.ptr[(x + i) * image.dims[0] + y + j + 1] -
-                       image.ptr[(x + i) * image.dims[0] + y + j - 1];
+            float ix = image.ptr[(x + i + 1) * image.strides[1] +
+                                 (y + j) * image.strides[0]] -
+                       image.ptr[(x + i - 1) * image.strides[1] +
+                                 (y + j) * image.strides[0]];
+            float iy = image.ptr[(x + i) * image.strides[1] +
+                                 (y + j + 1) * image.strides[0]] -
+                       image.ptr[(x + i) * image.strides[1] +
+                                 (y + j - 1) * image.strides[0]];
 
             // Accumulate second order derivatives
             ixx += ix * ix;
@@ -181,7 +185,8 @@ __global__ void centroid_angle(const float* x_in, const float* y_in,
             int j = k % patch_size - r;
 
             // Calculate first order moments
-            T p = image.ptr[(x + i) * image.dims[0] + y + j];
+            T p = image.ptr[(x + i) * image.strides[1] +
+                            (y + j) * image.strides[0]];
             m01 += j * p;
             m10 += i * p;
         }
@@ -209,7 +214,7 @@ inline __device__ T get_pixel(unsigned x, unsigned y, const float ori,
     x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
     y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
 
-    return image.ptr[x * image.dims[0] + y];
+    return image.ptr[x * image.strides[1] + y * image.strides[0]];
 }
 
 inline __device__ int lookup(const int n, cudaTextureObject_t tex) {
diff --git a/src/backend/oneapi/kernel/convolve.hpp b/src/backend/oneapi/kernel/convolve.hpp
index ebec7dbe88..b582116726 100644
--- a/src/backend/oneapi/kernel/convolve.hpp
+++ b/src/backend/oneapi/kernel/convolve.hpp
@@ -13,6 +13,7 @@
 #include <common/kernel_cache.hpp>
 #include <debug_oneapi.hpp>
 #include <kernel/accessors.hpp>
+#include <kernel/memcopy.hpp>
 #include <af/defines.h>
 
 #include <sycl/sycl.hpp>
@@ -75,7 +76,7 @@ void prepareKernelArgs(conv_kparam_t<aT> &param, dim_t *oDims,
         param.nBBS0    = divup(oDims[0], THREADS);
         param.nBBS1    = batchDims[2];
         param.global   = range<3>(param.nBBS0 * THREADS * batchDims[1],
-                                param.nBBS1 * batchDims[3], 1);
+                                  param.nBBS1 * batchDims[3], 1);
         param.loc_size = (THREADS + 2 * (fDims[0] - 1));
     } else if (rank == 2) {
         param.local  = range<3>{THREADS_X, THREADS_Y, 1};
@@ -89,26 +90,13 @@ void prepareKernelArgs(conv_kparam_t<aT> &param, dim_t *oDims,
         param.nBBS1    = divup(oDims[1], CUBE_Y);
         int blk_z      = divup(oDims[2], CUBE_Z);
         param.global   = range<3>(param.nBBS0 * CUBE_X * batchDims[3],
-                                param.nBBS1 * CUBE_Y, blk_z * CUBE_Z);
+                                  param.nBBS1 * CUBE_Y, blk_z * CUBE_Z);
         param.loc_size = (CUBE_X + 2 * (fDims[0] - 1)) *
                          (CUBE_Y + 2 * (fDims[1] - 1)) *
                          (CUBE_Z + 2 * (fDims[2] - 1));
     }
 }
 
-template<typename T>
-void memcpyBuffer(sycl::buffer<T, 1> &dest, sycl::buffer<T, 1> &src,
-                  const size_t n, const size_t srcOffset) {
-    getQueue().submit([&](auto &h) {
-        sycl::accessor srcAcc{src, h, sycl::range{n}, sycl::id{srcOffset},
-                              sycl::read_only};
-        sycl::accessor destAcc{
-            dest,         h, sycl::range{n}, sycl::id{0}, sycl::write_only,
-            sycl::no_init};
-        h.copy(srcAcc, destAcc);
-    });
-}
-
 #include "convolve1.hpp"
 #include "convolve2.hpp"
 #include "convolve3.hpp"
diff --git a/src/backend/oneapi/kernel/convolve1.hpp b/src/backend/oneapi/kernel/convolve1.hpp
index 41c6facae6..7f41d29d20 100644
--- a/src/backend/oneapi/kernel/convolve1.hpp
+++ b/src/backend/oneapi/kernel/convolve1.hpp
@@ -134,20 +134,23 @@ void conv1Helper(const conv_kparam_t<aT> &param, Param<T> &out,
 template<typename T, typename aT>
 void conv1(conv_kparam_t<aT> &p, Param<T> &out, const Param<T> &sig,
            const Param<aT> &filt, const bool expand) {
-    const size_t se_size = filt.info.dims[0];
-    sycl::buffer<aT> impulse{sycl::range(filt.info.dims[0])};
-    int f0Off = filt.info.offset;
+    const dim_t se_size = filt.info.dims[0];
+    sycl::buffer<aT> impulse{sycl::range(se_size)};
+    const dim_t mstrides[4] = {1, se_size, se_size, se_size};
+    const dim_t mdims[4]    = {filt.info.dims[0], 1, 1, 1};
+    const dim_t f0Off       = filt.info.offset;
     for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) {
-        int f3Off = b3 * filt.info.strides[3];
+        const dim_t f3Off = b3 * filt.info.strides[3];
 
         for (int b2 = 0; b2 < filt.info.dims[2]; ++b2) {
-            int f2Off = b2 * filt.info.strides[2];
+            const dim_t f2Off = b2 * filt.info.strides[2];
 
             for (int b1 = 0; b1 < filt.info.dims[1]; ++b1) {
-                int f1Off = b1 * filt.info.strides[1];
+                const dim_t f1Off = b1 * filt.info.strides[1];
 
-                const size_t srcOffset = f0Off + f1Off + f2Off + f3Off;
-                memcpyBuffer(impulse, *filt.data, se_size, srcOffset);
+                const dim_t srcOffset = f0Off + f1Off + f2Off + f3Off;
+                kernel::memcopy(&impulse, mstrides, filt.data, mdims,
+                                filt.info.strides, srcOffset, 1);
                 p.impulse = &impulse;
 
                 p.o[0] = (p.outHasNoOffset ? 0 : b1);
diff --git a/src/backend/oneapi/kernel/convolve2.hpp b/src/backend/oneapi/kernel/convolve2.hpp
index 45bfa6c108..cf5f17ed1a 100644
--- a/src/backend/oneapi/kernel/convolve2.hpp
+++ b/src/backend/oneapi/kernel/convolve2.hpp
@@ -155,18 +155,21 @@ void conv2Helper(const conv_kparam_t<aT> &param, Param<T> out,
 template<typename T, typename aT>
 void conv2(conv_kparam_t<aT> &p, Param<T> &out, const Param<T> &sig,
            const Param<aT> &filt, const bool expand) {
-    size_t se_size = filt.info.dims[0] * filt.info.dims[1];
+    const dim_t se_size = filt.info.dims[0] * filt.info.dims[1];
     sycl::buffer<aT> impulse{sycl::range(se_size)};
-    int f0Off = filt.info.offset;
+    const dim_t mstrides[4] = {1, filt.info.dims[0], se_size, se_size};
+    const dim_t mdims[4]    = {filt.info.dims[0], filt.info.dims[1], 1, 1};
+    const dim_t f0Off       = filt.info.offset;
 
     for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) {
-        int f3Off = b3 * filt.info.strides[3];
+        const dim_t f3Off = b3 * filt.info.strides[3];
 
         for (int b2 = 0; b2 < filt.info.dims[2]; ++b2) {
-            int f2Off = b2 * filt.info.strides[2];
+            const dim_t f2Off = b2 * filt.info.strides[2];
 
-            const size_t srcOffset = f2Off + f3Off + f0Off;
-            memcpyBuffer(impulse, *filt.data, se_size, srcOffset);
+            const dim_t srcOffset = f2Off + f3Off + f0Off;
+            kernel::memcopy(&impulse, mstrides, filt.data, mdims,
+                            filt.info.strides, srcOffset, 2);
             p.impulse = &impulse;
 
             p.o[1] = (p.outHasNoOffset ? 0 : b2);
diff --git a/src/backend/oneapi/kernel/convolve3.hpp b/src/backend/oneapi/kernel/convolve3.hpp
index bdfcc4eb24..2b402ab32b 100644
--- a/src/backend/oneapi/kernel/convolve3.hpp
+++ b/src/backend/oneapi/kernel/convolve3.hpp
@@ -164,15 +164,21 @@ void conv3Helper(const conv_kparam_t<aT> &param, Param<T> &out,
 template<typename T, typename aT>
 void conv3(conv_kparam_t<aT> &p, Param<T> &out, const Param<T> &sig,
            const Param<aT> &filt, const bool expand) {
-    size_t se_size = filt.info.dims[0] * filt.info.dims[1] * filt.info.dims[2];
+    const dim_t se_size =
+        filt.info.dims[0] * filt.info.dims[1] * filt.info.dims[2];
     sycl::buffer<aT> impulse{sycl::range(se_size)};
-    int f0Off = filt.info.offset;
+    const dim_t mstrides[4] = {1, filt.info.dims[0],
+                               filt.info.dims[0] * filt.info.dims[1], se_size};
+    const dim_t mdims[4]    = {filt.info.dims[0], filt.info.dims[1],
+                               filt.info.dims[2], 1};
+    const dim_t f0Off       = filt.info.offset;
 
     for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) {
-        int f3Off = b3 * filt.info.strides[3];
+        const dim_t f3Off = b3 * filt.info.strides[3];
 
-        const size_t srcOffset = f3Off + f0Off;
-        memcpyBuffer(impulse, *filt.data, se_size, srcOffset);
+        const dim_t srcOffset = f3Off + f0Off;
+        kernel::memcopy(&impulse, mstrides, filt.data, mdims, filt.info.strides,
+                        srcOffset, 3);
         p.impulse = &impulse;
 
         p.o[2] = (p.outHasNoOffset ? 0 : b3);
diff --git a/src/backend/oneapi/kernel/convolve_separable.cpp b/src/backend/oneapi/kernel/convolve_separable.cpp
index 0f3dfacb30..5b9d1d6453 100644
--- a/src/backend/oneapi/kernel/convolve_separable.cpp
+++ b/src/backend/oneapi/kernel/convolve_separable.cpp
@@ -11,6 +11,7 @@
 #include <common/dispatch.hpp>
 #include <common/kernel_cache.hpp>
 #include <debug_oneapi.hpp>
+#include <kernel/memcopy.hpp>
 
 #include <string>
 #include <vector>
@@ -136,19 +137,6 @@ class convolveSeparableCreateKernel {
     sycl::local_accessor<T> localMem_;
 };
 
-template<typename T>
-void memcpyBuffer(sycl::buffer<T, 1> &dest, sycl::buffer<T, 1> &src,
-                  const size_t n, const size_t srcOffset) {
-    getQueue().submit([&](auto &h) {
-        sycl::accessor srcAcc{src, h, sycl::range{n}, sycl::id{srcOffset},
-                              sycl::read_only};
-        sycl::accessor destAcc{
-            dest,         h, sycl::range{n}, sycl::id{0}, sycl::write_only,
-            sycl::no_init};
-        h.copy(srcAcc, destAcc);
-    });
-}
-
 template<typename T, typename accType>
 void convSep(Param<T> out, const Param<T> signal, const Param<accType> filter,
              const int conv_dim, const bool expand) {
@@ -161,7 +149,7 @@ void convSep(Param<T> out, const Param<T> signal, const Param<accType> filter,
     constexpr int THREADS_X = 16;
     constexpr int THREADS_Y = 16;
 
-    const int fLen       = filter.info.dims[0] * filter.info.dims[1];
+    const dim_t fLen     = filter.info.dims[0] * filter.info.dims[1];
     const size_t C0_SIZE = (THREADS_X + 2 * (fLen - 1)) * THREADS_Y;
     const size_t C1_SIZE = (THREADS_Y + 2 * (fLen - 1)) * THREADS_X;
     size_t locSize       = (conv_dim == 0 ? C0_SIZE : C1_SIZE);
@@ -175,7 +163,10 @@ void convSep(Param<T> out, const Param<T> signal, const Param<accType> filter,
                               blk_y * signal.info.dims[3] * THREADS_Y);
 
     sycl::buffer<accType> mBuff = {sycl::range(fLen * sizeof(accType))};
-    memcpyBuffer(mBuff, *filter.data, fLen, 0);
+    const dim_t mstrides[4]     = {1, filter.info.dims[0], fLen, fLen};
+    const dim_t mdims[4] = {filter.info.dims[0], filter.info.dims[1], 1, 1};
+    kernel::memcopy(&mBuff, mstrides, filter.data, mdims, filter.info.strides,
+                    filter.info.offset, 2);
 
     getQueue().submit([&](auto &h) {
         sycl::accessor d_signal{*signal.data, h, sycl::read_only};
diff --git a/src/backend/opencl/Kernel.cpp b/src/backend/opencl/Kernel.cpp
index b5d818b6d2..e8286c5329 100644
--- a/src/backend/opencl/Kernel.cpp
+++ b/src/backend/opencl/Kernel.cpp
@@ -27,6 +27,55 @@ void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src,
     getQueue().enqueueCopyBuffer(*src, *dst, 0, 0, bytes);
 }
 
+void Kernel::copyToReadOnly(Kernel::DevPtrType dst, Kernel::DevPtrType src,
+                            size_t srcXInBytes, size_t bytes) {
+    getQueue().enqueueCopyBuffer(*src, *dst, srcXInBytes, 0, bytes);
+}
+
+void Kernel::copyToReadOnly2D(Kernel::DevPtrType dst, Kernel::DevPtrType src,
+                              size_t srcXInBytes, size_t srcPitchInBytes,
+                              size_t height, size_t widthInBytes) {
+    std::array<size_t, 3> src_origin = {srcXInBytes, 0, 0};
+    size_t src_row_pitch             = {srcPitchInBytes};
+    size_t src_slice_pitch           = {0};
+
+    std::array<size_t, 3> dst_origin = {0, 0, 0};
+    size_t dst_row_pitch             = {widthInBytes};
+    size_t dst_slice_pitch           = {0};
+
+    std::array<size_t, 3> region = {widthInBytes, height, 1};
+
+    // offset in bytes =
+    // src_origin[1]*src_row_pitch + src_origin[0]
+
+    getQueue().enqueueCopyBufferRect(*src, *dst, src_origin, dst_origin, region,
+                                     src_row_pitch, src_slice_pitch,
+                                     dst_row_pitch, dst_slice_pitch);
+}
+
+void Kernel::copyToReadOnly3D(Kernel::DevPtrType dst, Kernel::DevPtrType src,
+                              size_t srcXInBytes, size_t srcPitchInBytes,
+                              size_t srcHeight, size_t depth, size_t height,
+                              size_t widthInBytes) {
+    std::array<size_t, 3> src_origin = {srcXInBytes, 0, 0};
+    size_t src_row_pitch             = {srcPitchInBytes};
+    size_t src_slice_pitch           = {srcHeight * srcPitchInBytes};
+
+    std::array<size_t, 3> dst_origin = {0, 0, 0};
+    size_t dst_row_pitch             = {widthInBytes};
+    size_t dst_slice_pitch           = {height * widthInBytes};
+
+    std::array<size_t, 3> region = {widthInBytes, height, depth};
+
+    // offset in bytes =
+    // src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch +
+    // src_origin[0]
+
+    getQueue().enqueueCopyBufferRect(*src, *dst, src_origin, dst_origin, region,
+                                     src_row_pitch, src_slice_pitch,
+                                     dst_row_pitch, dst_slice_pitch);
+}
+
 void Kernel::setFlag(Kernel::DevPtrType dst, int* scalarValPtr,
                      const bool syncCopy) {
     UNUSED(syncCopy);
diff --git a/src/backend/opencl/Kernel.hpp b/src/backend/opencl/Kernel.hpp
index c5582d8f1c..9fc05ce3e1 100644
--- a/src/backend/opencl/Kernel.hpp
+++ b/src/backend/opencl/Kernel.hpp
@@ -54,6 +54,18 @@ class Kernel
 
     void copyToReadOnly(DevPtrType dst, DevPtrType src, size_t bytes) final;
 
+    void copyToReadOnly(DevPtrType dst, DevPtrType src, size_t srcXInBytes,
+                        size_t bytes) final;
+
+    void copyToReadOnly2D(DevPtrType dst, DevPtrType src, size_t srcXInBytes,
+                          size_t srcPitchInBytes, size_t height,
+                          size_t widthInBytes) final;
+
+    void copyToReadOnly3D(DevPtrType dst, DevPtrType src, size_t srcXInBytes,
+                          size_t srcPitchInBytes, size_t srcHeight,
+                          size_t depth, size_t height,
+                          size_t widthInBytes) final;
+
     void setFlag(DevPtrType dst, int* scalarValPtr,
                  const bool syncCopy = false) final;
 
diff --git a/src/backend/opencl/kernel/convolve/conv1.cpp b/src/backend/opencl/kernel/convolve/conv1.cpp
index 5bfa9668d6..d3f2f05f29 100644
--- a/src/backend/opencl/kernel/convolve/conv1.cpp
+++ b/src/backend/opencl/kernel/convolve/conv1.cpp
@@ -18,22 +18,16 @@ void conv1(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt,
            const bool expand) {
     size_t se_size = filt.info.dims[0] * sizeof(aT);
     p.impulse      = bufferAlloc(se_size);
-    int f0Off      = filt.info.offset;
+    dim_t f0Off    = filt.info.offset;
 
     for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) {
-        int f3Off = b3 * filt.info.strides[3];
+        dim_t f3Off = b3 * filt.info.strides[3];
 
         for (int b2 = 0; b2 < filt.info.dims[2]; ++b2) {
-            int f2Off = b2 * filt.info.strides[2];
+            dim_t f2Off = b2 * filt.info.strides[2];
 
             for (int b1 = 0; b1 < filt.info.dims[1]; ++b1) {
-                int f1Off = b1 * filt.info.strides[1];
-
-                // FIXME: if the filter array is strided, direct copy of symbols
-                // might cause issues
-                getQueue().enqueueCopyBuffer(
-                    *filt.data, *p.impulse,
-                    (f0Off + f1Off + f2Off + f3Off) * sizeof(aT), 0, se_size);
+                dim_t f1Off = b1 * filt.info.strides[1];
 
                 p.o[0] = (p.outHasNoOffset ? 0 : b1);
                 p.o[1] = (p.outHasNoOffset ? 0 : b2);
@@ -42,7 +36,9 @@ void conv1(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt,
                 p.s[1] = (p.inHasNoOffset ? 0 : b2);
                 p.s[2] = (p.inHasNoOffset ? 0 : b3);
 
-                convNHelper<T, aT>(p, out, sig, filt, 1, expand);
+                convNHelper<T, aT>(p, out, sig, filt,
+                                   (f0Off + f1Off + f2Off + f3Off) * sizeof(aT),
+                                   1, expand);
             }
         }
     }
diff --git a/src/backend/opencl/kernel/convolve/conv2_impl.hpp b/src/backend/opencl/kernel/convolve/conv2_impl.hpp
index 9798714750..12f2c2fdf8 100644
--- a/src/backend/opencl/kernel/convolve/conv2_impl.hpp
+++ b/src/backend/opencl/kernel/convolve/conv2_impl.hpp
@@ -18,7 +18,8 @@ namespace kernel {
 
 template<typename T, typename aT>
 void conv2Helper(const conv_kparam_t& param, Param out, const Param signal,
-                 const Param filter, const bool expand) {
+                 const Param filter, const size_t srcXInBytes,
+                 const bool expand) {
     using cl::EnqueueArgs;
     using cl::NDRange;
     using std::string;
@@ -27,8 +28,8 @@ void conv2Helper(const conv_kparam_t& param, Param out, const Param signal,
     constexpr bool IsComplex =
         std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value;
 
-    const int f0 = filter.info.dims[0];
-    const int f1 = filter.info.dims[1];
+    const dim_t f0 = filter.info.dims[0];
+    const dim_t f1 = filter.info.dims[1];
     const size_t LOC_SIZE =
         (THREADS_X + 2 * (f0 - 1)) * (THREADS_Y + 2 * (f1 - 1));
 
@@ -54,6 +55,17 @@ void conv2Helper(const conv_kparam_t& param, Param out, const Param signal,
     auto convolve = common::getKernel(
         "convolve", {{ops_cl_src, convolve_cl_src}}, tmpltArgs, compileOpts);
 
+    if (filter.info.strides[1] == f0) {
+        // 2D linear filter array
+        convolve.copyToReadOnly(param.impulse, filter.data, srcXInBytes,
+                                f0 * f1 * sizeof(aT));
+    } else {
+        // 2D strided filter array
+        convolve.copyToReadOnly2D(param.impulse, filter.data, srcXInBytes,
+                                  filter.info.strides[1] * sizeof(aT), f1,
+                                  f0 * sizeof(aT));
+    }
+
     convolve(EnqueueArgs(getQueue(), param.global, param.local), *out.data,
              out.info, *signal.data, signal.info, *param.impulse, filter.info,
              param.nBBS0, param.nBBS1, param.o[1], param.o[2], param.s[1],
@@ -65,26 +77,21 @@ void conv2(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt,
            const bool expand) {
     size_t se_size = filt.info.dims[0] * filt.info.dims[1] * sizeof(aT);
     p.impulse      = bufferAlloc(se_size);
-    int f0Off      = filt.info.offset;
+    dim_t f0Off    = filt.info.offset;
 
     for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) {
-        int f3Off = b3 * filt.info.strides[3];
+        dim_t f3Off = b3 * filt.info.strides[3];
 
         for (int b2 = 0; b2 < filt.info.dims[2]; ++b2) {
-            int f2Off = b2 * filt.info.strides[2];
-
-            // FIXME: if the filter array is strided, direct copy of symbols
-            // might cause issues
-            getQueue().enqueueCopyBuffer(*filt.data, *p.impulse,
-                                         (f2Off + f3Off + f0Off) * sizeof(aT),
-                                         0, se_size);
+            dim_t f2Off = b2 * filt.info.strides[2];
 
             p.o[1] = (p.outHasNoOffset ? 0 : b2);
             p.o[2] = (p.outHasNoOffset ? 0 : b3);
             p.s[1] = (p.inHasNoOffset ? 0 : b2);
             p.s[2] = (p.inHasNoOffset ? 0 : b3);
 
-            conv2Helper<T, aT>(p, out, sig, filt, expand);
+            conv2Helper<T, aT>(p, out, sig, filt,
+                               (f2Off + f3Off + f0Off) * sizeof(aT), expand);
         }
     }
 }
diff --git a/src/backend/opencl/kernel/convolve/conv3.cpp b/src/backend/opencl/kernel/convolve/conv3.cpp
index 1383e8f443..4bc545f34f 100644
--- a/src/backend/opencl/kernel/convolve/conv3.cpp
+++ b/src/backend/opencl/kernel/convolve/conv3.cpp
@@ -23,15 +23,11 @@ void conv3(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt,
 
     for (int b3 = 0; b3 < filt.info.dims[3]; ++b3) {
         int f3Off = b3 * filt.info.strides[3];
-        // FIXME: if the filter array is strided, direct copy of symbols
-        // might cause issues
-        getQueue().enqueueCopyBuffer(*filt.data, *p.impulse,
-                                     (f0Off + f3Off) * sizeof(aT), 0, se_size);
+        p.o[2]    = (p.outHasNoOffset ? 0 : b3);
+        p.s[2]    = (p.inHasNoOffset ? 0 : b3);
 
-        p.o[2] = (p.outHasNoOffset ? 0 : b3);
-        p.s[2] = (p.inHasNoOffset ? 0 : b3);
-
-        convNHelper<T, aT>(p, out, sig, filt, 3, expand);
+        convNHelper<T, aT>(p, out, sig, filt, (f0Off + f3Off) * sizeof(aT), 3,
+                           expand);
     }
 }
 
diff --git a/src/backend/opencl/kernel/convolve/conv_common.hpp b/src/backend/opencl/kernel/convolve/conv_common.hpp
index bd93419c7c..dbd96d3168 100644
--- a/src/backend/opencl/kernel/convolve/conv_common.hpp
+++ b/src/backend/opencl/kernel/convolve/conv_common.hpp
@@ -87,12 +87,17 @@ void prepareKernelArgs(conv_kparam_t& param, dim_t* oDims, const dim_t* fDims,
 
 template<typename T, typename aT>
 void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal,
-                 const Param& filter, const int rank, const bool expand) {
+                 const Param& filter, const size_t srcXInBytes, const int rank,
+                 const bool expand) {
     using cl::EnqueueArgs;
     using cl::NDRange;
     using std::string;
     using std::vector;
 
+    const dim_t f0 = filter.info.dims[0];
+    const dim_t f1 = filter.info.dims[1];
+    const dim_t f2 = filter.info.dims[2];
+
     constexpr bool IsComplex =
         std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value;
 
@@ -117,6 +122,27 @@ void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal,
     auto convolve = common::getKernel(
         "convolve", {{ops_cl_src, convolve_cl_src}}, tmpltArgs, compileOpts);
 
+    switch (rank) {
+        case 1:
+            convolve.copyToReadOnly(param.impulse, filter.data, srcXInBytes,
+                                    f0 * sizeof(aT));
+            break;
+        case 3:
+            if (filter.info.strides[2] == f0 * f1) {
+                // 3D linear filter array
+                convolve.copyToReadOnly(param.impulse, filter.data, srcXInBytes,
+                                        f0 * f1 * f2 * sizeof(aT));
+            } else {
+                // 3D strided filter array
+                convolve.copyToReadOnly3D(
+                    param.impulse, filter.data, srcXInBytes,
+                    filter.info.strides[1] * sizeof(aT),
+                    filter.info.strides[2] / filter.info.strides[1], f2, f1,
+                    f0 * sizeof(aT));
+            }
+            break;
+    }
+
     convolve(EnqueueArgs(getQueue(), param.global, param.local), *out.data,
              out.info, *signal.data, signal.info, cl::Local(param.loc_size),
              *param.impulse, filter.info, param.nBBS0, param.nBBS1, param.o[0],
diff --git a/src/backend/opencl/kernel/convolve_separable.cpp b/src/backend/opencl/kernel/convolve_separable.cpp
index 83a9116d72..7800e64ccf 100644
--- a/src/backend/opencl/kernel/convolve_separable.cpp
+++ b/src/backend/opencl/kernel/convolve_separable.cpp
@@ -76,9 +76,18 @@ void convSep(Param out, const Param signal, const Param filter,
                        blk_y * signal.info.dims[3] * THREADS_Y);
 
     cl::Buffer *mBuff = bufferAlloc(fLen * sizeof(accType));
-    // FIX ME: if the filter array is strided, direct might cause issues
-    getQueue().enqueueCopyBuffer(*filter.data, *mBuff, 0, 0,
-                                 fLen * sizeof(accType));
+    if (fLen == filter.info.strides[2]) {
+        // Linear 2D filter array
+        conv.copyToReadOnly(mBuff, filter.data,
+                            filter.info.offset * sizeof(accType),
+                            fLen * sizeof(accType));
+    } else {
+        // Strided 2D filter array
+        conv.copyToReadOnly2D(
+            mBuff, filter.data, filter.info.offset * sizeof(accType),
+            filter.info.strides[1] * sizeof(accType), filter.info.dims[1],
+            filter.info.dims[0] * sizeof(accType));
+    }
 
     conv(cl::EnqueueArgs(getQueue(), global, local), *out.data, out.info,
          *signal.data, signal.info, *mBuff, blk_x, blk_y);
diff --git a/src/backend/opencl/kernel/fast.cl b/src/backend/opencl/kernel/fast.cl
index ef80350f01..48d628fa2b 100644
--- a/src/backend/opencl/kernel/fast.cl
+++ b/src/backend/opencl/kernel/fast.cl
@@ -129,17 +129,18 @@ void load_shared_image(global const T* in, KParam iInfo,
                        unsigned lx, unsigned ly) {
     // Copy an image patch to shared memory, with a 3-pixel edge
     if (ix < lx && iy < ly && x - 3 < iInfo.dims[0] && y - 3 < iInfo.dims[1]) {
+        in += iInfo.offset;
         local_image[(ix) + (bx + 6) * (iy)] =
-            in[(x - 3) + iInfo.dims[0] * (y - 3)];
+            in[(x - 3) * iInfo.strides[0] + (y - 3) * iInfo.strides[1]];
         if (x + lx - 3 < iInfo.dims[0])
             local_image[(ix + lx) + (bx + 6) * (iy)] =
-                in[(x + lx - 3) + iInfo.dims[0] * (y - 3)];
+                in[(x + lx -3) * iInfo.strides[0] + (y - 3) * iInfo.strides[1]];
         if (y + ly - 3 < iInfo.dims[1])
             local_image[(ix) + (bx + 6) * (iy + ly)] =
-                in[(x - 3) + iInfo.dims[0] * (y + ly - 3)];
+                in[(x - 3) * iInfo.strides[0] + (y + ly - 3) * iInfo.strides[1]];
         if (x + lx - 3 < iInfo.dims[0] && y + ly - 3 < iInfo.dims[1])
             local_image[(ix + lx) + (bx + 6) * (iy + ly)] =
-                in[(x + lx - 3) + iInfo.dims[0] * (y + ly - 3)];
+                in[(x + lx - 3) * iInfo.strides[0] + (y + ly - 3) * iInfo.strides[1]];
     }
 }
 
@@ -155,7 +156,7 @@ kernel void locate_features(global const T* in, KParam iInfo,
     unsigned lx = bx / 2 + 3;
     unsigned ly = by / 2 + 3;
 
-    load_shared_image(in + iInfo.offset, iInfo, local_image, ix, iy, bx, by, x,
+    load_shared_image(in, iInfo, local_image, ix, iy, bx, by, x,
                       y, lx, ly);
     barrier(CLK_LOCAL_MEM_FENCE);
     locate_features_core(local_image, score, iInfo, thr, x, y, edge);
diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp
index 835c20c745..e1e0ee5186 100644
--- a/src/backend/opencl/kernel/harris.hpp
+++ b/src/backend/opencl/kernel/harris.hpp
@@ -126,21 +126,20 @@ void harris(unsigned *corners_out, Param &x_out, Param &y_out, Param &resp_out,
 
     // Second order-derivatives kernel sizes
     const unsigned blk_x_so =
-        divup(in.info.dims[3] * in.info.strides[3], HARRIS_THREADS_PER_GROUP);
+        divup(in.info.dims[0] * in.info.dims[1], HARRIS_THREADS_PER_GROUP);
     const NDRange local_so(HARRIS_THREADS_PER_GROUP, 1);
     const NDRange global_so(blk_x_so * HARRIS_THREADS_PER_GROUP, 1);
 
     // Compute second-order derivatives
     soOp(EnqueueArgs(getQueue(), global_so, local_so), *ixx.get(), *ixy.get(),
-         *iyy.get(), in.info.dims[3] * in.info.strides[3], *ix.get(),
-         *iy.get());
+         *iyy.get(), in.info.dims[0] * in.info.dims[1], *ix.get(), *iy.get());
     CL_DEBUG_FINISH(getQueue());
 
     // Convolve second order derivatives with proper window filter
     conv_helper<T, convAccT>(ixx, ixy, iyy, filter);
 
     cl::Buffer *d_responses =
-        bufferAlloc(in.info.dims[3] * in.info.strides[3] * sizeof(T));
+        bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T));
 
     // Harris responses kernel sizes
     unsigned blk_x_hr =
@@ -159,7 +158,7 @@ void harris(unsigned *corners_out, Param &x_out, Param &y_out, Param &resp_out,
 
     // Number of corners is not known a priori, limit maximum number of corners
     // according to image dimensions
-    unsigned corner_lim = in.info.dims[3] * in.info.strides[3] * 0.2f;
+    unsigned corner_lim = in.info.dims[0] * in.info.dims[1] * 0.2f;
 
     unsigned corners_found      = 0;
     cl::Buffer *d_corners_found = bufferAlloc(sizeof(unsigned));
@@ -221,7 +220,8 @@ void harris(unsigned *corners_out, Param &x_out, Param &y_out, Param &resp_out,
                 harris_idx.info.dims[k - 1] * harris_idx.info.strides[k - 1];
         }
 
-        int sort_elem = harris_resp.info.strides[3] * harris_resp.info.dims[3];
+        int sort_elem = harris_resp.info.dims[0] * harris_resp.info.dims[1];
+
         harris_resp.data = d_resp_corners;
         // Create indices using range
         harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned));
diff --git a/src/backend/opencl/kernel/orb.cl b/src/backend/opencl/kernel/orb.cl
index d8a31c81ec..be60db3b7b 100644
--- a/src/backend/opencl/kernel/orb.cl
+++ b/src/backend/opencl/kernel/orb.cl
@@ -128,6 +128,7 @@ kernel void harris_response(
     local float data[BLOCK_SIZE * BLOCK_SIZE];
 
     unsigned f = get_global_id(0);
+    image += iInfo.offset;
 
     unsigned x, y;
     float ixx = 0.f, iyy = 0.f, ixy = 0.f;
@@ -155,10 +156,10 @@ kernel void harris_response(
                 int j = k % block_size - r;
 
                 // Calculate local x and y derivatives
-                float ix = image[(x + i + 1) * iInfo.dims[0] + y + j] -
-                           image[(x + i - 1) * iInfo.dims[0] + y + j];
-                float iy = image[(x + i) * iInfo.dims[0] + y + j + 1] -
-                           image[(x + i) * iInfo.dims[0] + y + j - 1];
+                float ix = image[(x + i + 1) * iInfo.strides[1] + (y + j) * iInfo.strides[0]] -
+                           image[(x + i - 1) * iInfo.strides[1] + (y + j) * iInfo.strides[0]] ;
+                float iy = image[(x + i) * iInfo.strides[1] + (y + j + 1) * iInfo.strides[0]] -
+                           image[(x + i) * iInfo.strides[1] + (y + j - 1) * iInfo.strides[0]];
 
                 // Accumulate second order derivatives
                 ixx += ix * ix;
@@ -219,7 +220,7 @@ kernel void centroid_angle(global const float* x_in,
                 int j = k % patch_size - r;
 
                 // Calculate first order moments
-                T p = image[(x + i) * iInfo.dims[0] + y + j];
+                T p = image[(x + i) * iInfo.strides[1] + (y + j) * iInfo.strides[0] + iInfo.offset];
                 m01 += j * p;
                 m10 += i * p;
             }
@@ -246,7 +247,7 @@ inline T get_pixel(unsigned x, unsigned y, const float ori, const unsigned size,
     x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
     y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
 
-    return image[x * iInfo.dims[0] + y];
+    return image[x * iInfo.strides[1] + y * iInfo.strides[0] + iInfo.offset];
 }
 
 kernel void extract_orb(global unsigned* desc_out, const unsigned n_feat,
diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp
index 5d4f523f16..63debc6df4 100644
--- a/src/backend/opencl/kernel/orb.hpp
+++ b/src/backend/opencl/kernel/orb.hpp
@@ -327,13 +327,14 @@ void orb(unsigned* out_feat, Param& x_out, Param& y_out, Param& score_out,
         Param lvl_tmp;
 
         if (blur_img) {
-            lvl_filt = lvl_img;
-            lvl_tmp  = lvl_img;
-
-            lvl_filt.data = bufferAlloc(lvl_filt.info.dims[0] *
-                                        lvl_filt.info.dims[1] * sizeof(T));
-            lvl_tmp.data  = bufferAlloc(lvl_tmp.info.dims[0] *
-                                        lvl_tmp.info.dims[1] * sizeof(T));
+            const dim_t pixels = lvl_img.info.dims[0] * lvl_img.info.dims[1];
+            lvl_filt.info = {{lvl_img.info.dims[0], lvl_img.info.dims[1], 1, 1},
+                             {1, lvl_img.info.dims[0], pixels, pixels},
+                             0};
+            lvl_filt.data = bufferAlloc(pixels * sizeof(T));
+
+            lvl_tmp.info = lvl_filt.info;
+            lvl_tmp.data = bufferAlloc(pixels * sizeof(T));
 
             // Calculate a separable Gaussian kernel
             if (h_gauss == nullptr) {
diff --git a/test/arrayfire_test.cpp b/test/arrayfire_test.cpp
index dedaedbf75..6803cc586d 100644
--- a/test/arrayfire_test.cpp
+++ b/test/arrayfire_test.cpp
@@ -105,17 +105,16 @@ std::string readNextNonEmptyLine(std::ifstream &file) {
 
 std::string getBackendName(bool lower) {
     af::Backend backend = af::getActiveBackend();
-    switch(backend) {
-    case AF_BACKEND_CPU:
-        return lower ? std::string("cpu") : std::string("CPU");
-    case AF_BACKEND_CUDA:
-        return lower ? std::string("cuda") : std::string("CUDA");
-    case AF_BACKEND_OPENCL:
-        return lower ? std::string("opencl") : std::string("OpenCL");
-    case AF_BACKEND_ONEAPI:
-        return lower ? std::string("oneapi") : std::string("oneAPI");
-    default:
-        return lower ? std::string("unknown") : std::string("Unknown");
+    switch (backend) {
+        case AF_BACKEND_CPU:
+            return lower ? std::string("cpu") : std::string("CPU");
+        case AF_BACKEND_CUDA:
+            return lower ? std::string("cuda") : std::string("CUDA");
+        case AF_BACKEND_OPENCL:
+            return lower ? std::string("opencl") : std::string("OpenCL");
+        case AF_BACKEND_ONEAPI:
+            return lower ? std::string("oneapi") : std::string("oneAPI");
+        default: return lower ? std::string("unknown") : std::string("Unknown");
     }
 }
 
@@ -2046,6 +2045,163 @@ INSTANTIATE(std::complex<float>);
 INSTANTIATE(std::complex<double>);
 #undef INSTANTIATE
 
+af::array toTempFormat(tempFormat form, const af::array &in) {
+    af::array ret;
+    const af::dim4 &dims = in.dims();
+    switch (form) {
+        case JIT_FORMAT:
+            switch (in.type()) {
+                case b8: ret = not(in); break;
+                default: ret = in * 2;
+            }
+            // Make sure that the base array is <> form original
+            ret.eval();
+            switch (in.type()) {
+                case b8: ret = not(ret); break;
+                default: ret /= 2;
+            }
+            break;
+        case SUB_FORMAT_dim0: {
+            af::dim4 pdims(dims);
+            pdims[0] += 2;
+            af::array parent = af::randu(pdims, in.type());
+            parent(af::seq(1, dims[0]), af::span, af::span, af::span) = in;
+            ret = parent(af::seq(1, dims[0]), af::span, af::span, af::span);
+        }; break;
+        case SUB_FORMAT_dim1: {
+            af::dim4 pdims(dims);
+            pdims[1] += 2;
+            af::array parent = af::randu(pdims, in.type());
+            parent(af::span, af::seq(1, dims[1]), af::span, af::span) = in;
+            ret = parent(af::span, af::seq(1, dims[1]), af::span, af::span);
+        }; break;
+        case SUB_FORMAT_dim2: {
+            af::dim4 pdims(dims);
+            pdims[2] += 2;
+            af::array parent = af::randu(pdims, in.type());
+            parent(af::span, af::span, af::seq(1, dims[2]), af::span) = in;
+            ret = parent(af::span, af::span, af::seq(1, dims[2]), af::span);
+        }; break;
+        case SUB_FORMAT_dim3: {
+            af::dim4 pdims(dims);
+            pdims[3] += 2;
+            af::array parent = af::randu(pdims, in.type());
+            parent(af::span, af::span, af::span, af::seq(1, dims[3])) = in;
+            ret = parent(af::span, af::span, af::span, af::seq(1, dims[3]));
+        }; break;
+        case REORDERED_FORMAT: {
+            const dim_t idxs[4] = {0, 3, 1, 2};
+            // idxs[0] has to be 0, to keep the same data in mem
+            dim_t rev_idxs[4];
+            for (dim_t i = 0; i < 4; ++i) { rev_idxs[idxs[i]] = i; };
+            ret = af::reorder(in, idxs[0], idxs[1], idxs[2], idxs[3]);
+            ret = ret.copy();  // make data linear
+            ret = af::reorder(ret, rev_idxs[0], rev_idxs[1], rev_idxs[2],
+                              rev_idxs[3]);
+            // ret has same content as in, although data is stored in
+            // different order
+        }; break;
+        case LINEAR_FORMAT:
+        default: ret = in.copy();
+    };
+    return ret;
+}
+
+void toTempFormat(tempFormat form, af_array *out, const af_array &in) {
+    dim_t dims[4];
+    af_get_dims(dims, dims + 1, dims + 2, dims + 3, in);
+    unsigned numdims;
+    af_get_numdims(&numdims, in);
+    af_dtype ty;
+    af_get_type(&ty, in);
+    switch (form) {
+        case JIT_FORMAT: {
+            // af_array one = nullptr, min_one = nullptr, res = nullptr;
+            af_array res = nullptr, two = nullptr;
+            ASSERT_SUCCESS(af_constant(&two, 2, numdims, dims, ty));
+            switch (ty) {
+                case b8: af_not(&res, in); break;
+                default:
+                    // ret = in + af::constant(1, dims, in.type());
+                    ASSERT_SUCCESS(af_mul(&res, in, two, false));
+            }
+            // Make sure that the base array is <> form original
+            ASSERT_SUCCESS(af_eval(res));
+            switch (ty) {
+                case b8: af_not(out, res); break;
+                default:
+                    ASSERT_SUCCESS(af_div(out, res, two, false));  // NO EVAL!!
+            }
+            ASSERT_SUCCESS(af_release_array(two));
+            two = nullptr;
+            ASSERT_SUCCESS(af_release_array(res));
+            res = nullptr;
+        }; break;
+        case SUB_FORMAT_dim0: {
+            const dim_t pdims[4] = {dims[0] + 2, dims[1], dims[2], dims[3]};
+            af_array parent      = nullptr;
+            ASSERT_SUCCESS(af_randu(&parent, std::max(1u, numdims), pdims, ty));
+            const af_seq idxs[4] = {af_make_seq(1, dims[0], 1), af_span,
+                                    af_span, af_span};
+
+            ASSERT_SUCCESS(af_assign_seq(out, parent, numdims, idxs, in));
+            ASSERT_SUCCESS(af_index(out, parent, numdims, idxs));
+            ASSERT_SUCCESS(af_release_array(parent));
+        }; break;
+        case SUB_FORMAT_dim1: {
+            const dim_t pdims[4] = {dims[0], dims[1] + 2, dims[2], dims[3]};
+            af_array parent      = nullptr;
+            ASSERT_SUCCESS(af_randu(&parent, std::max(2u, numdims), pdims, ty));
+            const af_seq idxs[4] = {af_span, af_make_seq(1, dims[1], 1),
+                                    af_span, af_span};
+            ASSERT_SUCCESS(af_assign_seq(out, parent, numdims, idxs, in));
+            ASSERT_SUCCESS(af_index(out, parent, numdims, idxs));
+            ASSERT_SUCCESS(af_release_array(parent));
+            parent = nullptr;
+        }; break;
+        case SUB_FORMAT_dim2: {
+            const dim_t pdims[4] = {dims[0], dims[1], dims[2] + 2, dims[3]};
+            af_array parent      = nullptr;
+            ASSERT_SUCCESS(af_randu(&parent, std::max(3u, numdims), pdims, ty));
+            const af_seq idxs[4] = {af_span, af_span,
+                                    af_make_seq(1, dims[2], 1), af_span};
+            ASSERT_SUCCESS(af_assign_seq(out, parent, numdims, idxs, in));
+            ASSERT_SUCCESS(af_index(out, parent, numdims, idxs));
+            ASSERT_SUCCESS(af_release_array(parent));
+            parent = nullptr;
+        }; break;
+        case SUB_FORMAT_dim3: {
+            const dim_t pdims[4] = {dims[0], dims[1], dims[2], dims[3] + 2};
+            af_array parent      = nullptr;
+            ASSERT_SUCCESS(af_randu(&parent, std::max(4u, numdims), pdims, ty));
+            const af_seq idxs[4] = {af_span, af_span, af_span,
+                                    af_make_seq(1, dims[3], 1)};
+            ASSERT_SUCCESS(af_assign_seq(out, parent, numdims, idxs, in));
+            ASSERT_SUCCESS(af_index(out, parent, numdims, idxs));
+            ASSERT_SUCCESS(af_release_array(parent));
+            parent = nullptr;
+        }; break;
+        case REORDERED_FORMAT: {
+            const unsigned idxs[4] = {0, 3, 1, 2};
+            // idxs[0] has to be 0, to keep the same data in mem
+            dim_t rev_idxs[4];
+            for (dim_t i = 0; i < 4; ++i) { rev_idxs[idxs[i]] = i; };
+            af_array rev = nullptr;
+            ASSERT_SUCCESS(
+                af_reorder(&rev, in, idxs[0], idxs[1], idxs[2], idxs[3]));
+            ASSERT_SUCCESS(af_copy_array(out, rev));
+            ASSERT_SUCCESS(af_reorder(out, rev, rev_idxs[0], rev_idxs[1],
+                                      rev_idxs[2], rev_idxs[3]));
+            // ret has same content as in, although data is stored in
+            // different order
+            ASSERT_SUCCESS(af_release_array(rev));
+            rev = nullptr;
+        }; break;
+        case LINEAR_FORMAT:
+        default: af_copy_array(out, in);
+    };
+}
+
 int main(int argc, char **argv) {
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();
diff --git a/test/convolve.cpp b/test/convolve.cpp
index 5df8961e1b..2a9c733d59 100644
--- a/test/convolve.cpp
+++ b/test/convolve.cpp
@@ -1178,8 +1178,86 @@ TEST(ConvolveNN, ZeroPadding_Issue2817) {
               true);
 
     array incoming_gradient = constant(1 / 9.f, 3, 3);
-    array convolved_grad = convolve2GradientNN(incoming_gradient, signal, filter,
-                                               convolved, strides, padding, dilation,
-                                               AF_CONV_GRADIENT_FILTER);
+    array convolved_grad    = convolve2GradientNN(
+        incoming_gradient, signal, filter, convolved, strides, padding,
+        dilation, AF_CONV_GRADIENT_FILTER);
     ASSERT_EQ(sum<float>(abs(convolved - convolved_grad)) < 1E-5, true);
 }
+
+#define TESTS_TEMP_FORMATS(form)                                               \
+    TEST(TEMP_FORMAT, form##_1) {                                              \
+        vector<dim4> numDims;                                                  \
+        vector<vector<float>> in;                                              \
+        vector<vector<float>> tests;                                           \
+                                                                               \
+        readTests<float, float, int>(                                          \
+            string(TEST_DIR "/convolve/vector_same.test"), numDims, in,        \
+            tests);                                                            \
+        array signal(numDims[0], &(in[0].front()));                            \
+        array filter(numDims[1], &(in[1].front()));                            \
+                                                                               \
+        array out  = convolve1(toTempFormat(form, signal),                     \
+                               toTempFormat(form, filter), AF_CONV_DEFAULT);   \
+        array gold = convolve1(signal, filter, AF_CONV_DEFAULT);               \
+                                                                               \
+        EXPECT_ARRAYS_EQ(out, gold);                                           \
+    }                                                                          \
+                                                                               \
+    TEST(TEMP_FORMAT, form##_2) {                                              \
+        vector<dim4> numDims;                                                  \
+        vector<vector<float>> in;                                              \
+        vector<vector<float>> tests;                                           \
+                                                                               \
+        readTests<float, float, int>(                                          \
+            string(TEST_DIR "/convolve/rectangle_same_one2many.test"),         \
+            numDims, in, tests);                                               \
+        array signal(numDims[0], &(in[0].front()));                            \
+        array filter(numDims[1], &(in[1].front()));                            \
+                                                                               \
+        array out  = convolve2(toTempFormat(form, signal),                     \
+                               toTempFormat(form, filter), AF_CONV_DEFAULT);   \
+        array gold = convolve2(signal, filter, AF_CONV_DEFAULT);               \
+                                                                               \
+        EXPECT_ARRAYS_EQ(out, gold);                                           \
+    }                                                                          \
+                                                                               \
+    TEST(TEMP_FORMAT, form##_3) {                                              \
+        vector<dim4> numDims;                                                  \
+        vector<vector<float>> in;                                              \
+        vector<vector<float>> tests;                                           \
+                                                                               \
+        readTests<float, float, int>(                                          \
+            string(TEST_DIR "/convolve/cuboid_same_many2many.test"), numDims,  \
+            in, tests);                                                        \
+        array signal(numDims[0], &(in[0].front()));                            \
+        array filter(numDims[1], &(in[1].front()));                            \
+                                                                               \
+        array out  = convolve3(toTempFormat(form, signal),                     \
+                               toTempFormat(form, filter), AF_CONV_DEFAULT);   \
+        array gold = convolve3(signal, filter, AF_CONV_DEFAULT);               \
+                                                                               \
+        EXPECT_ARRAYS_EQ(out, gold);                                           \
+    }                                                                          \
+                                                                               \
+    TEST(TEMP_FORMAT, form##_separable) {                                      \
+        vector<dim4> numDims;                                                  \
+        vector<vector<float>> in;                                              \
+        vector<vector<float>> tests;                                           \
+                                                                               \
+        readTests<float, float, int>(                                          \
+            string(TEST_DIR                                                    \
+                   "/convolve/separable_conv2d_same_rectangle_batch.test"),    \
+            numDims, in, tests);                                               \
+        array signal(numDims[0], &(in[0].front()));                            \
+        array cFilter(numDims[1], &(in[1].front()));                           \
+        array rFilter(numDims[2], &(in[2].front()));                           \
+                                                                               \
+        array out =                                                            \
+            convolve(toTempFormat(form, cFilter), toTempFormat(form, rFilter), \
+                     toTempFormat(form, signal), AF_CONV_DEFAULT);             \
+        array gold = convolve(cFilter, rFilter, signal, AF_CONV_DEFAULT);      \
+                                                                               \
+        EXPECT_ARRAYS_EQ(out, gold);                                           \
+    }
+
+FOREACH_TEMP_FORMAT(TESTS_TEMP_FORMATS)
diff --git a/test/fast.cpp b/test/fast.cpp
index c5e3225d0e..9114d98b7f 100644
--- a/test/fast.cpp
+++ b/test/fast.cpp
@@ -239,3 +239,41 @@ TEST(FloatFAST, CPP) {
     delete[] outOrientation;
     delete[] outSize;
 }
+
+#define TESTS_TEMP_FORMAT(form)                                               \
+    TEST(TEMP_FORMAT, form) {                                                 \
+        UNSUPPORTED_BACKEND(AF_BACKEND_ONEAPI);                               \
+        IMAGEIO_ENABLED_CHECK();                                              \
+                                                                              \
+        vector<dim4> inDims;                                                  \
+        vector<string> inFiles;                                               \
+        vector<vector<float>> gold;                                           \
+                                                                              \
+        readImageTests(string(TEST_DIR "/fast/square_nonmax_float.test"),     \
+                       inDims, inFiles, gold);                                \
+        inFiles[0].insert(0, string(TEST_DIR "/fast/"));                      \
+        array in = loadImage(inFiles[0].c_str(), false);                      \
+                                                                              \
+        features feat =                                                       \
+            fast(toTempFormat(form, in), 20.0f, 9, true, 0.05f, 3);           \
+        features gfeat = fast(in, 20.0f, 9, true, 0.05f, 3);                  \
+                                                                              \
+        /* The results from fast are dependent on threads runtime, so sort by \
+         * very simple hash on all columns of feat before comparing */        \
+        array score = (feat.getX() * inDims[0].dims[1] + feat.getY());        \
+        array idx, score_sorted;                                              \
+        sort(score_sorted, idx, score);                                       \
+                                                                              \
+        array gscore = (gfeat.getX() * inDims[0].dims[1] + gfeat.getY());     \
+        array gidx, gscore_sorted;                                            \
+        sort(gscore_sorted, gidx, gscore);                                    \
+                                                                              \
+        EXPECT_ARRAYS_EQ(feat.getX()(idx), gfeat.getX()(gidx));               \
+        EXPECT_ARRAYS_EQ(feat.getY()(idx), gfeat.getY()(gidx));               \
+        EXPECT_ARRAYS_EQ(feat.getScore()(idx), gfeat.getScore()(gidx));       \
+        EXPECT_ARRAYS_EQ(feat.getOrientation()(idx),                          \
+                         gfeat.getOrientation()(gidx));                       \
+        EXPECT_ARRAYS_EQ(feat.getSize()(idx), gfeat.getSize()(gidx));         \
+    }
+
+FOREACH_TEMP_FORMAT(TESTS_TEMP_FORMAT)
diff --git a/test/harris.cpp b/test/harris.cpp
index f2fd27d47a..3420edbba4 100644
--- a/test/harris.cpp
+++ b/test/harris.cpp
@@ -220,3 +220,43 @@ TEST(FloatHarris, CPP) {
             << "at: " << elIter << endl;
     }
 }
+
+#define TESTS_TEMP_FORMAT(form)                                                \
+    TEST(TEMP_FORMAT, form) {                                                  \
+        UNSUPPORTED_BACKEND(AF_BACKEND_ONEAPI);                                \
+        IMAGEIO_ENABLED_CHECK();                                               \
+                                                                               \
+        constexpr int MAX_CORNERS = 500;                                       \
+                                                                               \
+        vector<dim4> inDims;                                                   \
+        vector<string> inFiles;                                                \
+        vector<vector<float>> gold;                                            \
+                                                                               \
+        readImageTests(string(TEST_DIR "/harris/square_0_3.test"), inDims,     \
+                       inFiles, gold);                                         \
+        inFiles[0].insert(0, string(TEST_DIR "/harris/"));                     \
+        array in = loadImage(inFiles[0].c_str(), false);                       \
+                                                                               \
+        features out =                                                         \
+            harris(toTempFormat(form, in), MAX_CORNERS, 1e5f, 0.0f, 3, 0.04f); \
+        features gout = harris(in, MAX_CORNERS, 1e5f, 0.0f, 3, 0.04f);         \
+                                                                               \
+        ASSERT_GT(MAX_CORNERS, out.getNumFeatures());                          \
+                                                                               \
+        array score = out.getX() * in.dims().dims[1] + out.getY();             \
+        array idx, score_sorted;                                               \
+        sort(score_sorted, idx, score);                                        \
+                                                                               \
+        array gscore = gout.getX() * in.dims().dims[1] + gout.getY();          \
+        array gidx, gscore_sorted;                                             \
+        sort(gscore_sorted, gidx, gscore);                                     \
+                                                                               \
+        EXPECT_ARRAYS_EQ(out.getX()(idx), gout.getX()(gidx));                  \
+        EXPECT_ARRAYS_EQ(out.getY()(idx), gout.getY()(gidx));                  \
+        EXPECT_ARRAYS_EQ(out.getOrientation()(idx),                            \
+                         gout.getOrientation()(gidx));                         \
+        EXPECT_ARRAYS_EQ(out.getScore()(idx), gout.getScore()(gidx));          \
+        EXPECT_ARRAYS_EQ(out.getSize()(idx), gout.getSize()(gidx));            \
+    };
+
+FOREACH_TEMP_FORMAT(TESTS_TEMP_FORMAT)
\ No newline at end of file
diff --git a/test/orb.cpp b/test/orb.cpp
index 3ace1f4b05..25a4d13eee 100644
--- a/test/orb.cpp
+++ b/test/orb.cpp
@@ -326,3 +326,59 @@ TEST(ORB, CPP) {
     delete[] outSize;
     delete[] outDesc;
 }
+
+#define TEST_TEMP_FORMATS(form)                                               \
+    TEST(TEMP_FORMAT, form) {                                                 \
+        UNSUPPORTED_BACKEND(AF_BACKEND_ONEAPI);                               \
+        IMAGEIO_ENABLED_CHECK();                                              \
+                                                                              \
+        constexpr size_t MAX_FEATURES = 400;                                  \
+                                                                              \
+        vector<dim4> inDims;                                                  \
+        vector<string> inFiles;                                               \
+        vector<vector<float>> goldFeat;                                       \
+        vector<vector<unsigned>> goldDesc;                                    \
+                                                                              \
+        readImageFeaturesDescriptors<unsigned>(                               \
+            string(TEST_DIR "/orb/square.test"), inDims, inFiles, goldFeat,   \
+            goldDesc);                                                        \
+        inFiles[0].insert(0, string(TEST_DIR "/orb/"));                       \
+        array in = loadImage(inFiles[0].c_str(), false);                      \
+                                                                              \
+        features feat;                                                        \
+        array desc;                                                           \
+        orb(feat, desc, toTempFormat(form, in), 20.0f, MAX_FEATURES, 1.2f, 8, \
+            true);                                                            \
+        features gfeat;                                                       \
+        array gdesc;                                                          \
+        orb(gfeat, gdesc, in, 20.0f, MAX_FEATURES, 1.2f, 8, true);            \
+                                                                              \
+        /* The clipping of the features is dependent on threads runtime, so   \
+         * capture them all. */                                               \
+        ASSERT_GT(MAX_FEATURES, feat.getNumFeatures())                        \
+            << "Please increase MAX_FEATURES to capture all features";        \
+                                                                              \
+        /* The results from orb are dependent on threads runtime, so sort     \
+         * by very simple hash on all columns of feat before comparing */     \
+        array score = (feat.getX() * inDims[0].dims[1] + feat.getY()) *       \
+                      feat.getScore() * feat.getOrientation() *               \
+                      feat.getSize();                                         \
+        array idx, score_sorted;                                              \
+        sort(score_sorted, idx, score);                                       \
+                                                                              \
+        array gscore = (gfeat.getX() * inDims[0].dims[1] + gfeat.getY()) *    \
+                       gfeat.getScore() * gfeat.getOrientation() *            \
+                       gfeat.getSize();                                       \
+        array gidx, gscore_sorted;                                            \
+        sort(gscore_sorted, gidx, gscore);                                    \
+                                                                              \
+        EXPECT_ARRAYS_EQ(feat.getX()(idx), gfeat.getX()(gidx));               \
+        EXPECT_ARRAYS_EQ(feat.getY()(idx), gfeat.getY()(gidx));               \
+        EXPECT_ARRAYS_EQ(feat.getScore()(idx), gfeat.getScore()(gidx));       \
+        EXPECT_ARRAYS_EQ(feat.getOrientation()(idx),                          \
+                         gfeat.getOrientation()(gidx));                       \
+        EXPECT_ARRAYS_EQ(feat.getSize()(idx), gfeat.getSize()(gidx));         \
+        EXPECT_ARRAYS_EQ(desc(af::span, idx), gdesc(af::span, gidx));         \
+    }
+
+FOREACH_TEMP_FORMAT(TEST_TEMP_FORMATS)
diff --git a/test/testHelpers.hpp b/test/testHelpers.hpp
index 5f6b02b5a4..405f23309d 100644
--- a/test/testHelpers.hpp
+++ b/test/testHelpers.hpp
@@ -244,10 +244,10 @@ bool noHalfTests(af::dtype ty);
     GTEST_SKIP() << "Device doesn't support Half"
 
 #ifdef SKIP_UNSUPPORTED_TESTS
-#define UNSUPPORTED_BACKEND(backend)                        \
-    if(backend == af::getActiveBackend())                   \
-        GTEST_SKIP() << "Skipping unsupported function on " \
-                        + getBackendName() + " backend"
+#define UNSUPPORTED_BACKEND(backend)                                         \
+    if (backend == af::getActiveBackend())                                   \
+    GTEST_SKIP() << "Skipping unsupported function on " + getBackendName() + \
+                        " backend"
 #else
 #define UNSUPPORTED_BACKEND(backend)
 #endif
@@ -653,6 +653,30 @@ ::testing::AssertionResult assertArrayEq(std::string aName, std::string bName,
                                          const af_array a, const af_array b,
                                          TestOutputArrayInfo *metadata);
 
+enum tempFormat {
+    LINEAR_FORMAT,    // Linear array (= default)
+    JIT_FORMAT,       // Array which has JIT operations outstanding
+    SUB_FORMAT_dim0,  // Array where only a subset is allocated for dim0
+    SUB_FORMAT_dim1,  // Array where only a subset is allocated for dim1
+    SUB_FORMAT_dim2,  // Array where only a subset is allocated for dim2
+    SUB_FORMAT_dim3,  // Array where only a subset is allocated for dim3
+    REORDERED_FORMAT  // Array where the dimensions are reordered
+};
+// Calls the function fn for all available formats
+#define FOREACH_TEMP_FORMAT(TESTS) \
+    TESTS(LINEAR_FORMAT)           \
+    TESTS(JIT_FORMAT)              \
+    TESTS(SUB_FORMAT_dim0)         \
+    TESTS(SUB_FORMAT_dim1)         \
+    TESTS(SUB_FORMAT_dim2)         \
+    TESTS(SUB_FORMAT_dim3)         \
+    TESTS(REORDERED_FORMAT)
+
+// formats the "in" array according to provided format.  The content remains
+// unchanged.
+af::array toTempFormat(tempFormat form, const af::array &in);
+void toTempFormat(tempFormat form, af_array *out, const af_array &in);
+
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif