Fix sub-array support for orb in the cpu, cuda, opencl, and oneapi backends

willyborn · willyborn · commit ad1413ff8ada · 2025-06-28T11:44:42.000+02:00
diff --git a/src/backend/cpu/kernel/orb.hpp b/src/backend/cpu/kernel/orb.hpp
@@ -119,8 +119,9 @@ void harris_response(float* x_out, float* y_out, float* score_out,
                      unsigned* usable_feat, CParam<T> image,
                      const unsigned block_size, const float k_thr,
                      const unsigned patch_size) {
-    const af::dim4 idims = image.dims();
-    const T* image_ptr   = image.get();
+    const af::dim4 idims    = image.dims();
+    const af::dim4 istrides = image.strides();
+    const T* image_ptr      = image.get();
     for (unsigned f = 0; f < total_feat; f++) {
         unsigned x, y;
         float scl = 1.f;
@@ -154,10 +155,12 @@ void harris_response(float* x_out, float* y_out, float* score_out,
             int j = k % block_size - r;
 
             // Calculate local x and y derivatives
-            float ix = image_ptr[(x + i + 1) * idims[0] + y + j] -
-                       image_ptr[(x + i - 1) * idims[0] + y + j];
-            float iy = image_ptr[(x + i) * idims[0] + y + j + 1] -
-                       image_ptr[(x + i) * idims[0] + y + j - 1];
+            float ix =
+                image_ptr[(x + i + 1) * istrides[1] + (y + j) * istrides[0]] -
+                image_ptr[(x + i - 1) * istrides[1] + (y + j) * istrides[0]];
+            float iy =
+                image_ptr[(x + i) * istrides[1] + (y + j + 1) * istrides[0]] -
+                image_ptr[(x + i) * istrides[1] + (y + j - 1) * istrides[0]];
 
             // Accumulate second order derivatives
             ixx += ix * ix;
@@ -189,8 +192,9 @@ template<typename T>
 void centroid_angle(const float* x_in, const float* y_in,
                     float* orientation_out, const unsigned total_feat,
                     CParam<T> image, const unsigned patch_size) {
-    const af::dim4 idims = image.dims();
-    const T* image_ptr   = image.get();
+    const af::dim4 idims    = image.dims();
+    const af::dim4 istrides = image.strides();
+    const T* image_ptr      = image.get();
     for (unsigned f = 0; f < total_feat; f++) {
         unsigned x = (unsigned)round(x_in[f]);
         unsigned y = (unsigned)round(y_in[f]);
@@ -205,7 +209,7 @@ void centroid_angle(const float* x_in, const float* y_in,
             int j = k % patch_size - r;
 
             // Calculate first order moments
-            T p = image_ptr[(x + i) * idims[0] + y + j];
+            T p = image_ptr[(x + i) * istrides[1] + (y + j) * istrides[0]];
             m01 += j * p;
             m10 += i * p;
         }
@@ -219,17 +223,17 @@ template<typename T>
 inline T get_pixel(unsigned x, unsigned y, const float ori, const unsigned size,
                    const int dist_x, const int dist_y, CParam<T> image,
                    const unsigned patch_size) {
-    const af::dim4 idims = image.dims();
-    const T* image_ptr   = image.get();
-    float ori_sin        = sin(ori);
-    float ori_cos        = cos(ori);
-    float patch_scl      = (float)size / (float)patch_size;
+    const af::dim4 istrides = image.strides();
+    const T* image_ptr      = image.get();
+    float ori_sin           = sin(ori);
+    float ori_cos           = cos(ori);
+    float patch_scl         = (float)size / (float)patch_size;
 
     // Calculate point coordinates based on orientation and size
     x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
     y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
 
-    return image_ptr[x * idims[0] + y];
+    return image_ptr[x * istrides[1] + y * istrides[0]];
 }
 
 template<typename T>
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
@@ -125,10 +125,14 @@ __global__ void harris_response(float* score_out, float* size_out,
             int j = k % block_size - r;
 
             // Calculate local x and y derivatives
-            float ix = image.ptr[(x + i + 1) * image.dims[0] + y + j] -
-                       image.ptr[(x + i - 1) * image.dims[0] + y + j];
-            float iy = image.ptr[(x + i) * image.dims[0] + y + j + 1] -
-                       image.ptr[(x + i) * image.dims[0] + y + j - 1];
+            float ix = image.ptr[(x + i + 1) * image.strides[1] +
+                                 (y + j) * image.strides[0]] -
+                       image.ptr[(x + i - 1) * image.strides[1] +
+                                 (y + j) * image.strides[0]];
+            float iy = image.ptr[(x + i) * image.strides[1] +
+                                 (y + j + 1) * image.strides[0]] -
+                       image.ptr[(x + i) * image.strides[1] +
+                                 (y + j - 1) * image.strides[0]];
 
             // Accumulate second order derivatives
             ixx += ix * ix;
@@ -181,7 +185,8 @@ __global__ void centroid_angle(const float* x_in, const float* y_in,
             int j = k % patch_size - r;
 
             // Calculate first order moments
-            T p = image.ptr[(x + i) * image.dims[0] + y + j];
+            T p = image.ptr[(x + i) * image.strides[1] +
+                            (y + j) * image.strides[0]];
             m01 += j * p;
             m10 += i * p;
         }
@@ -209,7 +214,7 @@ inline __device__ T get_pixel(unsigned x, unsigned y, const float ori,
     x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
     y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
 
-    return image.ptr[x * image.dims[0] + y];
+    return image.ptr[x * image.strides[1] + y * image.strides[0]];
 }
 
 inline __device__ int lookup(const int n, cudaTextureObject_t tex) {
diff --git a/src/backend/opencl/kernel/orb.cl b/src/backend/opencl/kernel/orb.cl
@@ -128,6 +128,7 @@ kernel void harris_response(
     local float data[BLOCK_SIZE * BLOCK_SIZE];
 
     unsigned f = get_global_id(0);
+    image += iInfo.offset;
 
     unsigned x, y;
     float ixx = 0.f, iyy = 0.f, ixy = 0.f;
@@ -155,10 +156,10 @@ kernel void harris_response(
                 int j = k % block_size - r;
 
                 // Calculate local x and y derivatives
-                float ix = image[(x + i + 1) * iInfo.dims[0] + y + j] -
-                           image[(x + i - 1) * iInfo.dims[0] + y + j];
-                float iy = image[(x + i) * iInfo.dims[0] + y + j + 1] -
-                           image[(x + i) * iInfo.dims[0] + y + j - 1];
+                float ix = image[(x + i + 1) * iInfo.strides[1] + (y + j) * iInfo.strides[0]] -
+                           image[(x + i - 1) * iInfo.strides[1] + (y + j) * iInfo.strides[0]] ;
+                float iy = image[(x + i) * iInfo.strides[1] + (y + j + 1) * iInfo.strides[0]] -
+                           image[(x + i) * iInfo.strides[1] + (y + j - 1) * iInfo.strides[0]];
 
                 // Accumulate second order derivatives
                 ixx += ix * ix;
@@ -219,7 +220,7 @@ kernel void centroid_angle(global const float* x_in,
                 int j = k % patch_size - r;
 
                 // Calculate first order moments
-                T p = image[(x + i) * iInfo.dims[0] + y + j];
+                T p = image[(x + i) * iInfo.strides[1] + (y + j) * iInfo.strides[0] + iInfo.offset];
                 m01 += j * p;
                 m10 += i * p;
             }
@@ -246,7 +247,7 @@ inline T get_pixel(unsigned x, unsigned y, const float ori, const unsigned size,
     x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
     y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
 
-    return image[x * iInfo.dims[0] + y];
+    return image[x * iInfo.strides[1] + y * iInfo.strides[0] + iInfo.offset];
 }
 
 kernel void extract_orb(global unsigned* desc_out, const unsigned n_feat,
diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp
@@ -327,13 +327,14 @@ void orb(unsigned* out_feat, Param& x_out, Param& y_out, Param& score_out,
         Param lvl_tmp;
 
         if (blur_img) {
-            lvl_filt = lvl_img;
-            lvl_tmp  = lvl_img;
-
-            lvl_filt.data = bufferAlloc(lvl_filt.info.dims[0] *
-                                        lvl_filt.info.dims[1] * sizeof(T));
-            lvl_tmp.data  = bufferAlloc(lvl_tmp.info.dims[0] *
-                                        lvl_tmp.info.dims[1] * sizeof(T));
+            const dim_t pixels = lvl_img.info.dims[0] * lvl_img.info.dims[1];
+            lvl_filt.info = {{lvl_img.info.dims[0], lvl_img.info.dims[1], 1, 1},
+                             {1, lvl_img.info.dims[0], pixels, pixels},
+                             0};
+            lvl_filt.data = bufferAlloc(pixels * sizeof(T));
+
+            lvl_tmp.info = lvl_filt.info;
+            lvl_tmp.data = bufferAlloc(pixels * sizeof(T));
 
             // Calculate a separable Gaussian kernel
             if (h_gauss == nullptr) {
diff --git a/test/orb.cpp b/test/orb.cpp
@@ -326,3 +326,59 @@ TEST(ORB, CPP) {
     delete[] outSize;
     delete[] outDesc;
 }
+
+#define TEST_TEMP_FORMATS(form)                                               \
+    TEST(TEMP_FORMAT, form) {                                                 \
+        UNSUPPORTED_BACKEND(AF_BACKEND_ONEAPI);                               \
+        IMAGEIO_ENABLED_CHECK();                                              \
+                                                                              \
+        constexpr size_t MAX_FEATURES = 400;                                  \
+                                                                              \
+        vector<dim4> inDims;                                                  \
+        vector<string> inFiles;                                               \
+        vector<vector<float>> goldFeat;                                       \
+        vector<vector<unsigned>> goldDesc;                                    \
+                                                                              \
+        readImageFeaturesDescriptors<unsigned>(                               \
+            string(TEST_DIR "/orb/square.test"), inDims, inFiles, goldFeat,   \
+            goldDesc);                                                        \
+        inFiles[0].insert(0, string(TEST_DIR "/orb/"));                       \
+        array in = loadImage(inFiles[0].c_str(), false);                      \
+                                                                              \
+        features feat;                                                        \
+        array desc;                                                           \
+        orb(feat, desc, toTempFormat(form, in), 20.0f, MAX_FEATURES, 1.2f, 8, \
+            true);                                                            \
+        features gfeat;                                                       \
+        array gdesc;                                                          \
+        orb(gfeat, gdesc, in, 20.0f, MAX_FEATURES, 1.2f, 8, true);            \
+                                                                              \
+        /* Which features get clipped depends on thread scheduling, so        \
+         * make sure every feature was captured. */                           \
+        ASSERT_GT(MAX_FEATURES, feat.getNumFeatures())                        \
+            << "Please increase MAX_FEATURES to capture all features";        \
+                                                                              \
+        /* The order of orb results depends on thread scheduling, so sort     \
+         * by a simple hash over all feat columns before comparing. */        \
+        array score = (feat.getX() * inDims[0].dims[1] + feat.getY()) *       \
+                      feat.getScore() * feat.getOrientation() *               \
+                      feat.getSize();                                         \
+        array idx, score_sorted;                                              \
+        sort(score_sorted, idx, score);                                       \
+                                                                              \
+        array gscore = (gfeat.getX() * inDims[0].dims[1] + gfeat.getY()) *    \
+                       gfeat.getScore() * gfeat.getOrientation() *            \
+                       gfeat.getSize();                                       \
+        array gidx, gscore_sorted;                                            \
+        sort(gscore_sorted, gidx, gscore);                                    \
+                                                                              \
+        EXPECT_ARRAYS_EQ(feat.getX()(idx), gfeat.getX()(gidx));               \
+        EXPECT_ARRAYS_EQ(feat.getY()(idx), gfeat.getY()(gidx));               \
+        EXPECT_ARRAYS_EQ(feat.getScore()(idx), gfeat.getScore()(gidx));       \
+        EXPECT_ARRAYS_EQ(feat.getOrientation()(idx),                          \
+                         gfeat.getOrientation()(gidx));                       \
+        EXPECT_ARRAYS_EQ(feat.getSize()(idx), gfeat.getSize()(gidx));         \
+        EXPECT_ARRAYS_EQ(desc(af::span, idx), gdesc(af::span, gidx));         \
+    }
+
+FOREACH_TEMP_FORMAT(TEST_TEMP_FORMATS)