From 8dc49a2b81c112f6acc5dc0441652fbf3799f99f Mon Sep 17 00:00:00 2001
From: Michael Platings <michael.platings@arm.com>
Date: Tue, 14 Jan 2025 15:49:21 +0000
Subject: [PATCH 1/2] Add support for constant border to Warp Perspective

---
 CHANGELOG.md                                  |   2 +-
 doc/functionality.md                          |   8 +-
 doc/opencv.md                                 |   2 +-
 kleidicv/include/kleidicv/kleidicv.h          |   1 +
 .../kleidicv/transform/warp_perspective.h     |   4 +-
 .../src/transform/warp_perspective_neon.cpp   | 264 ++++++++++++++---
 kleidicv/src/transform/warp_perspective_sc.h  | 203 ++++++++++---
 scripts/benchmark/benchmarks.txt              |   4 +
 test/api/test_thread.cpp                      |   4 +-
 test/api/test_warp_perspective.cpp            | 269 +++++++++++++-----
 10 files changed, 595 insertions(+), 166 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac4a4bfff..8ad96463c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,7 +21,7 @@ This changelog aims to follow the guiding principles of
   - Nearest neighbour, for replicated borders with 1-channel u8 and u16 inputs.
   - Fixed-point interpolation, for replicated borders with 1-channel u8 input.
 - WarpPerspective implementation
-  - Nearest and Linear interpolation method, for replicated borders and 1-channel u8 input.
+  - Nearest and Linear interpolation method, for 1-channel u8 input.
 
 ### Changed
 - Increased precision of sum for 32 bit floats and expose it to OpenCV HAL.
diff --git a/doc/functionality.md b/doc/functionality.md
index f2bf16266..9cc895415 100644
--- a/doc/functionality.md
+++ b/doc/functionality.md
@@ -99,7 +99,7 @@ See `doc/opencv.md` for details of the functionality available in OpenCV.
 | Remap int16+uint16 fixed-point coordinates |  x  |     |
 
 # WarpPerspective
-|                                          |  u8 | u16 |
-|------------------------------------------|-----|-----|
-| Nearest neighbour, replicated borders    |  x  |     |
-| Linear interpolation, replicated borders |  x  |     |
+|                      |  u8 |
+|----------------------|-----|
+| Nearest neighbour    |  x  |
+| Linear interpolation |  x  |
diff --git a/doc/opencv.md b/doc/opencv.md
index a6f1d2fb8..478712113 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -229,7 +229,7 @@ Notes on parameters:
 * `src.cols`, `src.rows`, `dst.cols`, `dst.rows` - must be less than 2^24
 * `src.step` - must be less than 2^32
 * `dst.cols` - must be at least 8
-* `borderMode` - only supports `BORDER_REPLICATE`
+* `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`
 * `interpolation` - supports `INTER_NEAREST` and `INTER_LINEAR`
 
 ### [`cv::pyrDown()`](https://docs.opencv.org/4.10.0/d4/d86/group__imgproc__filter.html#gaf9bba239dfca11654cb7f50f889fc2ff)
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index 1133718ea..517cd41e5 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1912,6 +1912,7 @@ kleidicv_error_t kleidicv_scharr_interleaved_s16_u8(
 /// @param border_type    Way of handling the border. The supported border types
 ///                       are: \n
 ///                         - @ref KLEIDICV_BORDER_TYPE_REPLICATE
+///                         - @ref KLEIDICV_BORDER_TYPE_CONSTANT
 /// @param border_value   Border value if the border_type is
 ///                       @ref KLEIDICV_BORDER_TYPE_CONSTANT.
 kleidicv_error_t kleidicv_warp_perspective_u8(
diff --git a/kleidicv/include/kleidicv/transform/warp_perspective.h b/kleidicv/include/kleidicv/transform/warp_perspective.h
index 45b512764..cf7962afe 100644
--- a/kleidicv/include/kleidicv/transform/warp_perspective.h
+++ b/kleidicv/include/kleidicv/transform/warp_perspective.h
@@ -35,8 +35,8 @@ inline bool warp_perspective_is_implemented(
     return (dst_width >= 8 &&
             (interpolation == KLEIDICV_INTERPOLATION_NEAREST ||
              interpolation == KLEIDICV_INTERPOLATION_LINEAR) &&
-            border_type ==
-                kleidicv_border_type_t::KLEIDICV_BORDER_TYPE_REPLICATE &&
+            (border_type == KLEIDICV_BORDER_TYPE_REPLICATE ||
+             border_type == KLEIDICV_BORDER_TYPE_CONSTANT) &&
             channels == 1);
   } else {
     return false;
diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp
index bba3f7e82..1f4c6dbf4 100644
--- a/kleidicv/src/transform/warp_perspective_neon.cpp
+++ b/kleidicv/src/transform/warp_perspective_neon.cpp
@@ -4,6 +4,8 @@
 
 #include <arm_neon.h>
 
+#include <cassert>
+
 #include "kleidicv/ctypes.h"
 #include "kleidicv/kleidicv.h"
 #include "kleidicv/neon.h"
@@ -31,15 +33,153 @@ namespace kleidicv::neon {
 //      yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8)
 //
 
+namespace {
+
 typedef struct {
   float32x4_t x, y;
 } CoordVectorPair;
 
-template <typename ScalarType, kleidicv_interpolation_type_t Inter,
-          bool IsLarge>
+template <typename ScalarType>
+inline uint32x4_t get_pixels_or_border_large(uint32x4_t x, uint32x4_t y,
+                                             uint32x4_t v_xmax,
+                                             uint32x4_t v_ymax,
+                                             uint32x2_t v_src_stride,
+                                             Rows<const ScalarType> src_rows,
+                                             const ScalarType *border_value) {
+  uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax));
+  // Calculate offsets from coordinates (y * stride + x)
+  // To avoid losing precision, the final offsets should be in 64 bits
+  uint64x2_t offsets_low =
+      vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride);
+  uint64x2_t offsets_high =
+      vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride);
+  // Copy pixels from source
+  uint32_t pixel_data[4] = {
+      vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u64(offsets_low, 0)]
+                                  : border_value[0],
+      vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u64(offsets_low, 1)]
+                                  : border_value[0],
+      vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u64(offsets_high, 0)]
+                                  : border_value[0],
+      vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u64(offsets_high, 1)]
+                                  : border_value[0],
+  };
+  return vld1q_u32(pixel_data);
+}
+
+template <typename ScalarType>
+inline uint32x4_t get_pixels_or_border_small(uint32x4_t x, uint32x4_t y,
+                                             uint32x4_t v_xmax,
+                                             uint32x4_t v_ymax,
+                                             uint32x4_t v_src_stride,
+                                             Rows<const ScalarType> src_rows,
+                                             const ScalarType *border_value) {
+  uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax));
+  // Calculate offsets from coordinates (y * stride + x)
+  // Use this path only when the final offsets fit into 32 bits
+  uint32x4_t offsets = vmlaq_u32(x, y, v_src_stride);
+  // Copy pixels from source
+  uint32_t pixel_data[4] = {
+      vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u32(offsets, 0)]
+                                  : border_value[0],
+      vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u32(offsets, 1)]
+                                  : border_value[0],
+      vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u32(offsets, 2)]
+                                  : border_value[0],
+      vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u32(offsets, 3)]
+                                  : border_value[0],
+  };
+  return vld1q_u32(pixel_data);
+}
+
+template <typename ScalarType>
+inline void get_edge_pixels(unsigned &a_result, unsigned &b_result,
+                            unsigned &c_result, unsigned &d_result, int x0,
+                            int y0, ptrdiff_t offset,
+                            Rows<const ScalarType> src_rows, int src_width,
+                            int src_height) {
+  if (y0 >= 0) {
+    if (x0 >= 0) {
+      a_result = src_rows[offset];
+    }
+    if (x0 + 1 < src_width) {
+      b_result = src_rows[offset + 1];
+    }
+  }
+  if (y0 + 1 < src_height) {
+    offset += src_rows.stride();
+    if (x0 >= 0) {
+      c_result = src_rows[offset];
+    }
+    if (x0 + 1 < src_width) {
+      d_result = src_rows[offset + 1];
+    }
+  }
+}
+
+template <typename ScalarType>
+inline uint32x4_t calculate_linear_constant_border(
+    float32x4_t xf, float32x4_t yf, Rows<const ScalarType> src_rows,
+    int src_width, int src_height, const ScalarType *border_value) {
+  // Convert obviously out-of-range coordinates to values that are just beyond
+  // the largest permitted image width & height. This avoids the need for
+  // special case handling elsewhere.
+  float32x4_t big = vdupq_n_f32(1 << 24);
+  xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big);
+  yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big);
+
+  int32x4_t x0 = vcvtmq_s32_f32(xf);
+  int32x4_t y0 = vcvtmq_s32_f32(yf);
+  int x0_array[4], y0_array[4];
+  unsigned a_array[4], b_array[4], c_array[4], d_array[4];
+  vst1q_s32(x0_array, x0);
+  vst1q_s32(y0_array, y0);
+  for (int i = 0; i < 4; ++i) {
+    int x0i = x0_array[i];
+    int y0i = y0_array[i];
+    ptrdiff_t offset = x0i + y0i * src_rows.stride();
+
+    if (x0i < 0 || x0i + 1 >= src_width || y0i < 0 || y0i + 1 >= src_height) {
+      // Not entirely within the source image
+
+      a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value[0];
+
+      if (x0i < -1 || x0i >= src_width || y0i < -1 || y0i >= src_height) {
+        // Completely outside the source image
+        continue;
+      }
+
+      get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, y0i,
+                      offset, src_rows, src_width, src_height);
+      continue;
+    }
+
+    // Completely inside the source image
+    a_array[i] = src_rows[offset];
+    b_array[i] = src_rows[offset + 1];
+    offset += src_rows.stride();
+    c_array[i] = src_rows[offset];
+    d_array[i] = src_rows[offset + 1];
+  }
+
+  float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf));
+  float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf));
+  float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array));
+  float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array));
+  float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac);
+  float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array));
+  float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array));
+  float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac);
+  float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac);
+  return vcvtaq_u32_f32(result);
+}
+
+template <typename ScalarType, bool IsLarge,
+          kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border>
 void warp_perspective_operation(Rows<const ScalarType> src_rows,
                                 size_t src_width, size_t src_height,
                                 const float transform[9],
+                                const ScalarType *border_value,
                                 Rows<ScalarType> dst_rows, size_t dst_width,
                                 size_t y_begin, size_t y_end) {
   static constexpr uint32_t first_few_x[] = {0, 1, 2, 3};
@@ -51,7 +191,6 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
       vdupq_n_u32(static_cast<uint32_t>(src_rows.stride()));
   uint32x4_t v_xmax = vdupq_n_u32(static_cast<uint32_t>(src_width - 1));
   uint32x4_t v_ymax = vdupq_n_u32(static_cast<uint32_t>(src_height - 1));
-
   float32x4_t tx0, ty0, tw0;
 
   auto calculate_coordinates = [&](uint32_t x) {
@@ -106,6 +245,32 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     dst[ix + 3] = src_rows[vgetq_lane_u64(offsets_high, 1)];
   };
 
+  auto vector_path_nearest_constant_border = [&](uint32_t x,
+                                                 Columns<ScalarType> dst) {
+    auto &&[xf, yf] = calculate_coordinates(x);
+    // Convert coordinates to integers.
+    // Negative numbers will become large positive numbers.
+    // Since the source width and height is known to be <=2^24 these large
+    // positive numbers will always be treated as outside the source image
+    // bounds.
+    uint32x4_t xi = vreinterpretq_u32_s32(vcvtaq_s32_f32(xf));
+    uint32x4_t yi = vreinterpretq_u32_s32(vcvtaq_s32_f32(yf));
+    uint32x4_t pixels;
+    if constexpr (IsLarge) {
+      pixels = get_pixels_or_border_large(xi, yi, v_xmax, v_ymax, v_src_stride,
+                                          src_rows, border_value);
+    } else {
+      pixels = get_pixels_or_border_small(xi, yi, v_xmax, v_ymax, vq_src_stride,
+                                          src_rows, border_value);
+    }
+
+    ptrdiff_t ix = static_cast<ptrdiff_t>(x);
+    dst[ix] = static_cast<ScalarType>(vgetq_lane_u32(pixels, 0));
+    dst[ix + 1] = static_cast<ScalarType>(vgetq_lane_u32(pixels, 1));
+    dst[ix + 2] = static_cast<ScalarType>(vgetq_lane_u32(pixels, 2));
+    dst[ix + 3] = static_cast<ScalarType>(vgetq_lane_u32(pixels, 3));
+  };
+
   auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) {
     uint32x4_t offset = vmlaq_u32(x, y, vq_src_stride);
     uint64_t acc =
@@ -134,7 +299,7 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
   };
 
-  auto calculate_linear = [&](uint32_t x) {
+  auto calculate_linear_replicate = [&](uint32_t x) {
     auto load_floats = [&](uint32x4_t x, uint32x4_t y) {
       if constexpr (IsLarge) {
         return load_src_into_floats_large(x, y);
@@ -174,6 +339,18 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     return vminq_u32(vdupq_n_u32(0xFF), vcvtaq_u32_f32(result));
   };
 
+  auto calculate_linear = [&](uint32_t x) {
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
+      return calculate_linear_replicate(x);
+    } else {
+      static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
+      auto &&[xf, yf] = calculate_coordinates(x);
+      return calculate_linear_constant_border(
+          xf, yf, src_rows, static_cast<int>(src_width),
+          static_cast<int>(src_height), border_value);
+    }
+  };
+
   auto process_row = [&](uint32_t y, Columns<ScalarType> dst) {
     float dy = static_cast<float>(y);
     // Calculate half-transformed values at the first pixel (nominators)
@@ -187,7 +364,11 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     static const size_t kStep = VecTraits<float>::num_lanes();
     LoopUnroll2<TryToAvoidTailLoop> loop{dst_width, kStep};
     if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
-      if constexpr (IsLarge) {
+      if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+        loop.unroll_once([&](size_t x) {
+          vector_path_nearest_constant_border(static_cast<uint32_t>(x), dst);
+        });
+      } else if constexpr (IsLarge) {
         loop.unroll_once([&](size_t x) {
           vector_path_nearest_large(static_cast<uint32_t>(x), dst);
         });
@@ -232,6 +413,39 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
   }
 }
 
+// Convert border_type to a template argument.
+template <typename ScalarType, bool IsLarge,
+          kleidicv_interpolation_type_t Inter, typename... Args>
+void warp_perspective_operation(kleidicv_border_type_t border_type,
+                                Args &&...args) {
+  if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) {
+    warp_perspective_operation<ScalarType, IsLarge, Inter,
+                               KLEIDICV_BORDER_TYPE_REPLICATE>(
+        std::forward<Args>(args)...);
+  } else {
+    warp_perspective_operation<ScalarType, IsLarge, Inter,
+                               KLEIDICV_BORDER_TYPE_CONSTANT>(
+        std::forward<Args>(args)...);
+  }
+}
+
+// Convert interpolation_type to a template argument.
+template <typename ScalarType, bool IsLarge, typename... Args>
+void warp_perspective_operation(
+    kleidicv_interpolation_type_t interpolation_type, Args &&...args) {
+  if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) {
+    warp_perspective_operation<ScalarType, IsLarge,
+                               KLEIDICV_INTERPOLATION_NEAREST>(
+        std::forward<Args>(args)...);
+  } else {
+    warp_perspective_operation<ScalarType, IsLarge,
+                               KLEIDICV_INTERPOLATION_LINEAR>(
+        std::forward<Args>(args)...);
+  }
+}
+
+}  // namespace
+
 // Most of the complexity comes from parameter checking.
 // NOLINTBEGIN(readability-function-cognitive-complexity)
 template <typename T>
@@ -240,13 +454,17 @@ kleidicv_error_t warp_perspective_stripe(
     T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
     size_t y_begin, size_t y_end, const float transformation[9],
     size_t channels, kleidicv_interpolation_type_t interpolation,
-    kleidicv_border_type_t, const T *) {
+    kleidicv_border_type_t border_type, const T *border_value) {
   CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
   CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
   CHECK_POINTERS(transformation);
   CHECK_IMAGE_SIZE(src_width, src_height);
   CHECK_IMAGE_SIZE(dst_width, dst_height);
 
+  if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
+    return KLEIDICV_ERROR_NULL_POINTER;
+  }
+
   // Calculating in float32_t will only be precise until 24 bits, and
   // multiplication can only be done with 32x32 bits
   if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) ||
@@ -260,35 +478,13 @@ kleidicv_error_t warp_perspective_stripe(
   dst_rows += y_begin;
 
   if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
-    switch (interpolation) {
-      case KLEIDICV_INTERPOLATION_NEAREST:
-        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_NEAREST, true>(
-            src_rows, src_width, src_height, transformation, dst_rows,
-            dst_width, y_begin, y_end);
-        break;
-      case KLEIDICV_INTERPOLATION_LINEAR:
-        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_LINEAR, true>(
-            src_rows, src_width, src_height, transformation, dst_rows,
-            dst_width, y_begin, y_end);
-        break;
-      default:
-        break;  // GCOVR_EXCL_LINE
-    }
+    warp_perspective_operation<T, true>(
+        interpolation, border_type, src_rows, src_width, src_height,
+        transformation, border_value, dst_rows, dst_width, y_begin, y_end);
   } else {
-    switch (interpolation) {
-      case KLEIDICV_INTERPOLATION_NEAREST:
-        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_NEAREST, false>(
-            src_rows, src_width, src_height, transformation, dst_rows,
-            dst_width, y_begin, y_end);
-        break;
-      case KLEIDICV_INTERPOLATION_LINEAR:
-        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_LINEAR, false>(
-            src_rows, src_width, src_height, transformation, dst_rows,
-            dst_width, y_begin, y_end);
-        break;
-      default:
-        break;  // GCOVR_EXCL_LINE
-    }
+    warp_perspective_operation<T, false>(
+        interpolation, border_type, src_rows, src_width, src_height,
+        transformation, border_value, dst_rows, dst_width, y_begin, y_end);
   }
   // NOLINTEND(readability-function-cognitive-complexity)
   return KLEIDICV_OK;
diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h
index a790a6acf..519d544dd 100644
--- a/kleidicv/src/transform/warp_perspective_sc.h
+++ b/kleidicv/src/transform/warp_perspective_sc.h
@@ -4,7 +4,9 @@
 
 #include <arm_sve.h>
 
+#include <cassert>
 #include <cmath>
+#include <utility>
 
 #include "kleidicv/ctypes.h"
 #include "kleidicv/kleidicv.h"
@@ -37,11 +39,12 @@ namespace KLEIDICV_TARGET_NAMESPACE {
 //      yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8)
 //
 
-template <typename ScalarType, kleidicv_interpolation_type_t Inter,
-          bool IsLarge>
+template <typename ScalarType, bool IsLarge,
+          kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border>
 void warp_perspective_operation(Rows<const ScalarType> src_rows,
                                 size_t src_width, size_t src_height,
                                 const float transform[9],
+                                const ScalarType *border_value,
                                 Rows<ScalarType> dst_rows, size_t dst_width,
                                 size_t y_begin, size_t y_end) {
   svbool_t pg_all64 = svptrue_b64();
@@ -50,6 +53,13 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
   svuint32_t sv_xmax = svdup_n_u32(src_width - 1);
   svuint32_t sv_ymax = svdup_n_u32(src_height - 1);
   svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride());
+  svuint32_t sv_border;
+  // sv_border is only used if the border type is constant.
+  // If the border type is not constant then border_value is permitted to be
+  // null and must not be read.
+  if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+    sv_border = svdup_n_u32(border_value[0]);
+  }
 
   svfloat32_t T0 = svdup_n_f32(transform[0]);
   svfloat32_t T3 = svdup_n_f32(transform[3]);
@@ -82,14 +92,23 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
 
   auto calculate_nearest_coordinates = [&](size_t x) {
     calculate_coordinates(x);
-    // Round to the nearest integer, clamp it to within the dimensions of the
-    // source image (negative values are already saturated to 0)
-    xi = svmin_x(pg_all32,
-                 svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)),
-                 sv_xmax);
-    yi = svmin_x(pg_all32,
-                 svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)),
-                 sv_ymax);
+
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+      // Round to the nearest integer
+      xi = svreinterpret_u32_s32(
+          svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf)));
+      yi = svreinterpret_u32_s32(
+          svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf)));
+    } else {
+      // Round to the nearest integer, clamp it to within the dimensions of the
+      // source image (negative values are already saturated to 0)
+      xi = svmin_x(pg_all32,
+                   svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)),
+                   sv_xmax);
+      yi = svmin_x(pg_all32,
+                   svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)),
+                   sv_ymax);
+    }
   };
 
   auto load_small = [&](svbool_t pg, svuint32_t x, svuint32_t y) {
@@ -112,9 +131,28 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
                       svreinterpret_u32_u64(result_t));
   };
 
+  auto get_pixels_or_border = [&](svuint32_t x, svuint32_t y) {
+    svbool_t in_range = svand_b_z(pg32, svcmple_u32(pg32, x, sv_xmax),
+                                  svcmple_u32(pg32, y, sv_ymax));
+
+    svuint32_t result;
+    if constexpr (IsLarge) {
+      svbool_t pg_b = in_range;
+      svbool_t pg_t = svtrn2_b32(in_range, svpfalse());
+      result = load_large(pg_b, pg_t, x, y);
+    } else {
+      result = load_small(in_range, x, y);
+    }
+
+    // Select between source pixels and border colour
+    return svsel_u32(in_range, result, sv_border);
+  };
+
   auto vector_path_nearest_4x = [&](size_t x, Columns<ScalarType> dst) {
     auto load_source = [&](svuint32_t x, svuint32_t y) {
-      if constexpr (IsLarge) {
+      if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+        return get_pixels_or_border(x, y);
+      } else if constexpr (IsLarge) {
         return load_large(pg_all64, pg_all64, x, y);
       } else {
         return load_small(pg_all32, x, y);
@@ -144,16 +182,22 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
   auto vector_path_nearest_tail = [&](size_t x, size_t x_max,
                                       Columns<ScalarType> dst) {
     size_t length = x_max - x;
-    svbool_t pg_b = svwhilelt_b64(0ULL, (length + 1) / 2);
-    svbool_t pg_t = svwhilelt_b64(0ULL, length / 2);
-    svbool_t pg = svwhilelt_b32(0ULL, length);
-    // To avoid losing precision, the final indices use 64 bits
+    pg64_b = svwhilelt_b64(0ULL, (length + 1) / 2);
+    pg64_t = svwhilelt_b64(0ULL, length / 2);
+    pg32 = svwhilelt_b32(0ULL, length);
     calculate_nearest_coordinates(x);
-    svuint32_t result = load_large(pg_b, pg_t, xi, yi);
-    svst1b_u32(pg, &dst[static_cast<ptrdiff_t>(x)], result);
+    svuint32_t result;
+
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+      result = get_pixels_or_border(xi, yi);
+    } else {
+      // To avoid losing precision, the final indices use 64 bits
+      result = load_large(pg64_b, pg64_t, xi, yi);
+    }
+    svst1b_u32(pg32, &dst[static_cast<ptrdiff_t>(x)], result);
   };
 
-  auto calculate_linear = [&](size_t x) {
+  auto calculate_linear_replicate = [&](uint32_t x) {
     auto load_source = [&](svuint32_t x, svuint32_t y) {
       if constexpr (IsLarge) {
         return load_large(pg64_b, pg64_t, x, y);
@@ -201,6 +245,59 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
         svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)));
   };
 
+  auto calculate_linear_constant_border = [&](uint32_t x) {
+    calculate_coordinates(x);
+
+    // Convert obviously out-of-range coordinates to values that are just beyond
+    // the largest permitted image width & height. This avoids the need for
+    // special case handling elsewhere.
+    svfloat32_t big = svdup_n_f32(1 << 24);
+    xf = svsel_f32(svcmple_f32(pg_all32, svabs_f32_x(pg_all32, xf), big), xf,
+                   big);
+    yf = svsel_f32(svcmple_f32(pg_all32, svabs_f32_x(pg_all32, yf), big), yf,
+                   big);
+
+    svfloat32_t xf0 = svrintm_f32_x(pg_all32, xf);
+    svfloat32_t yf0 = svrintm_f32_x(pg_all32, yf);
+
+    svint32_t x0 = svcvt_s32_x(pg_all32, xf0);
+    svint32_t y0 = svcvt_s32_x(pg_all32, yf0);
+    svint32_t x1 = svadd_s32_x(pg_all32, x0, svdup_n_s32(1));
+    svint32_t y1 = svadd_s32_x(pg_all32, y0, svdup_n_s32(1));
+
+    svfloat32_t xfrac = svsub_f32_x(pg_all32, xf, xf0);
+    svfloat32_t yfrac = svsub_f32_x(pg_all32, yf, yf0);
+
+    svfloat32_t a = svcvt_f32_u32_x(
+        pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x0),
+                                       svreinterpret_u32_s32(y0)));
+    svfloat32_t b = svcvt_f32_u32_x(
+        pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x1),
+                                       svreinterpret_u32_s32(y0)));
+    svfloat32_t line0 =
+        svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
+    svfloat32_t c = svcvt_f32_u32_x(
+        pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x0),
+                                       svreinterpret_u32_s32(y1)));
+    svfloat32_t d = svcvt_f32_u32_x(
+        pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x1),
+                                       svreinterpret_u32_s32(y1)));
+    svfloat32_t line1 =
+        svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
+    svfloat32_t result = svmla_f32_x(
+        pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
+    return svcvt_u32_f32_x(pg_all32, svrinta_f32_x(pg_all32, result));
+  };
+
+  auto calculate_linear = [&](uint32_t x) {
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
+      return calculate_linear_replicate(x);
+    } else {
+      static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
+      return calculate_linear_constant_border(x);
+    }
+  };
+
   auto process_row = [&](size_t y, Columns<ScalarType> dst) {
     float fy = static_cast<float>(y);
     // Calculate half-transformed values at the first pixel (nominators)
@@ -269,6 +366,37 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
   }
 }
 
+// Convert border_type to a template argument.
+template <typename ScalarType, bool IsLarge,
+          kleidicv_interpolation_type_t Inter, typename... Args>
+void warp_perspective_operation(kleidicv_border_type_t border_type,
+                                Args &&...args) {
+  if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) {
+    warp_perspective_operation<ScalarType, IsLarge, Inter,
+                               KLEIDICV_BORDER_TYPE_REPLICATE>(
+        std::forward<Args>(args)...);
+  } else {
+    warp_perspective_operation<ScalarType, IsLarge, Inter,
+                               KLEIDICV_BORDER_TYPE_CONSTANT>(
+        std::forward<Args>(args)...);
+  }
+}
+
+// Convert interpolation_type to a template argument.
+template <typename ScalarType, bool IsLarge, typename... Args>
+void warp_perspective_operation(
+    kleidicv_interpolation_type_t interpolation_type, Args &&...args) {
+  if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) {
+    warp_perspective_operation<ScalarType, IsLarge,
+                               KLEIDICV_INTERPOLATION_NEAREST>(
+        std::forward<Args>(args)...);
+  } else {
+    warp_perspective_operation<ScalarType, IsLarge,
+                               KLEIDICV_INTERPOLATION_LINEAR>(
+        std::forward<Args>(args)...);
+  }
+}
+
 // Most of the complexity comes from parameter checking.
 // NOLINTBEGIN(readability-function-cognitive-complexity)
 template <typename T>
@@ -277,12 +405,15 @@ KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc(
     T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
     size_t y_begin, size_t y_end, const float transformation[9],
     size_t channels, kleidicv_interpolation_type_t interpolation,
-    kleidicv_border_type_t, const T *) {
+    kleidicv_border_type_t border_type, const T *border_value) {
   CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
   CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
   CHECK_POINTERS(transformation);
   CHECK_IMAGE_SIZE(src_width, src_height);
   CHECK_IMAGE_SIZE(dst_width, dst_height);
+  if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
+    return KLEIDICV_ERROR_NULL_POINTER;
+  }
 
   // Calculating in float32_t will only be precise until 24 bits, and
   // multiplication can only be done with 32x32 bits
@@ -299,35 +430,13 @@ KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc(
   dst_rows += y_begin;
 
   if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
-    switch (interpolation) {
-      case KLEIDICV_INTERPOLATION_NEAREST:
-        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_NEAREST, true>(
-            src_rows, src_width, src_height, transformation, dst_rows,
-            dst_width, y_begin, y_end);
-        break;
-      case KLEIDICV_INTERPOLATION_LINEAR:
-        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_LINEAR, true>(
-            src_rows, src_width, src_height, transformation, dst_rows,
-            dst_width, y_begin, y_end);
-        break;
-      default:
-        break;  // GCOVR_EXCL_LINE
-    }
+    warp_perspective_operation<T, true>(
+        interpolation, border_type, src_rows, src_width, src_height,
+        transformation, border_value, dst_rows, dst_width, y_begin, y_end);
   } else {
-    switch (interpolation) {
-      case KLEIDICV_INTERPOLATION_NEAREST:
-        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_NEAREST, false>(
-            src_rows, src_width, src_height, transformation, dst_rows,
-            dst_width, y_begin, y_end);
-        break;
-      case KLEIDICV_INTERPOLATION_LINEAR:
-        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_LINEAR, false>(
-            src_rows, src_width, src_height, transformation, dst_rows,
-            dst_width, y_begin, y_end);
-        break;
-      default:
-        break;  // GCOVR_EXCL_LINE
-    }
+    warp_perspective_operation<T, false>(
+        interpolation, border_type, src_rows, src_width, src_height,
+        transformation, border_value, dst_rows, dst_width, y_begin, y_end);
   }
 
   return KLEIDICV_OK;
diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt
index 601720b43..4eb782d1c 100755
--- a/scripts/benchmark/benchmarks.txt
+++ b/scripts/benchmark/benchmarks.txt
@@ -85,6 +85,10 @@ WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMA
 WarpPerspectiveNear_Nearest: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)'
 WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 8UC1)'
 WarpPerspectiveNear_Linear: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 8UC1)'
+WarpPerspective_Nearest_Constant: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_CONSTANT, 8UC1)'
+WarpPerspectiveNear_Nearest_Constant: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_CONSTANT, 8UC1)'
+WarpPerspective_Linear_Constant: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_CONSTANT, 8UC1)'
+WarpPerspectiveNear_Linear_Constant: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_CONSTANT, 8UC1)'
 
 BlurAndDownsample: opencv_perf_imgproc '*pyrDown/*' '($PIXEL_FORMAT, 8UC1)'
 
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
index 862f0229d..27f6a6ba4 100644
--- a/test/api/test_thread.cpp
+++ b/test/api/test_thread.cpp
@@ -666,13 +666,13 @@ TEST_P(Thread, warp_perspective_u8_not_implemented) {
       KLEIDICV_BORDER_TYPE_REPLICATE, border_value);
   check_warp_perspective_not_implemented<uint8_t>(
       kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_NEAREST,
-      KLEIDICV_BORDER_TYPE_CONSTANT, border_value);
+      KLEIDICV_BORDER_TYPE_REFLECT, border_value);
   check_warp_perspective_not_implemented<uint8_t>(
       kleidicv_thread_warp_perspective_u8, 2, KLEIDICV_INTERPOLATION_LINEAR,
       KLEIDICV_BORDER_TYPE_REPLICATE, border_value);
   check_warp_perspective_not_implemented<uint8_t>(
       kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_LINEAR,
-      KLEIDICV_BORDER_TYPE_CONSTANT, border_value);
+      KLEIDICV_BORDER_TYPE_REFLECT, border_value);
 }
 
 TEST_P(Thread, SobelHorizontal1Channel) {
diff --git a/test/api/test_warp_perspective.cpp b/test/api/test_warp_perspective.cpp
index 49cd9a674..482598002 100644
--- a/test/api/test_warp_perspective.cpp
+++ b/test/api/test_warp_perspective.cpp
@@ -4,6 +4,7 @@
 
 #include <gtest/gtest.h>
 
+#include <climits>
 #include <cmath>
 #include <limits>
 
@@ -53,13 +54,43 @@ static void sequential_initializer(test::Array2D<ScalarType> &source) {
   }
 }
 
+template <typename ScalarType>
+static const ScalarType *get_array2d_element_or_border(
+    const test::Array2D<ScalarType> &src, ptrdiff_t x, ptrdiff_t y,
+    kleidicv_border_type_t border_type, const ScalarType *border_value) {
+  if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) {
+    x = std::clamp<ptrdiff_t>(x, 0, static_cast<ptrdiff_t>(src.width()) - 1);
+    y = std::clamp<ptrdiff_t>(y, 0, static_cast<ptrdiff_t>(src.height()) - 1);
+  } else {
+    assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT);
+    if (x >= static_cast<ptrdiff_t>(src.width()) ||
+        y >= static_cast<ptrdiff_t>(src.height()) || x < 0 || y < 0) {
+      return border_value;
+    }
+  }
+  return src.at(y, x * src.channels());
+}
+
+template <typename T>
+static const auto &get_borders() {
+  using P = std::pair<kleidicv_border_type_t, const T *>;
+  static const T border_value[KLEIDICV_MAXIMUM_CHANNEL_COUNT] = {4, 3, 2, 1};
+  static const std::array borders{
+      P{KLEIDICV_BORDER_TYPE_REPLICATE, nullptr},
+      P{KLEIDICV_BORDER_TYPE_REPLICATE, border_value},
+      P{KLEIDICV_BORDER_TYPE_CONSTANT, border_value},
+  };
+  return borders;
+}
+
 template <class ScalarType>
 class WarpPerspectiveNearest : public testing::Test {
  public:
   static void test_stripe(size_t src_w, size_t src_h, size_t dst_w,
                           size_t dst_h, size_t y_begin, size_t y_end,
                           const float transform[9], size_t channels,
-                          size_t padding,
+                          kleidicv_border_type_t border_type,
+                          const ScalarType *border_value, size_t padding,
                           void (*initializer)(test::Array2D<ScalarType> &) =
                               sequential_initializer) {
     size_t src_total_width = channels * src_w;
@@ -77,7 +108,8 @@ class WarpPerspectiveNearest : public testing::Test {
       }
     }
 
-    calculate_expected(source, y_begin, y_end, transform, expected);
+    calculate_expected(source, y_begin, y_end, transform, border_type,
+                       border_value, expected);
 
     ASSERT_EQ(
         KLEIDICV_OK,
@@ -85,7 +117,7 @@ class WarpPerspectiveNearest : public testing::Test {
             source.data(), source.stride(), source.width(), source.height(),
             actual.data(), actual.stride(), actual.width(), actual.height(),
             y_begin, y_end, transform, channels, KLEIDICV_INTERPOLATION_NEAREST,
-            KLEIDICV_BORDER_TYPE_REPLICATE, {}));
+            border_type, border_value));
 
     ScalarType threshold = 1;
     for (size_t row = y_begin; row < y_end; ++row) {
@@ -114,16 +146,19 @@ class WarpPerspectiveNearest : public testing::Test {
   }
 
   static void test(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h,
-                   const float transform[9], size_t channels, size_t padding,
+                   const float transform[9], size_t channels,
+                   kleidicv_border_type_t border_type,
+                   const ScalarType *border_value, size_t padding,
                    void (*initializer)(test::Array2D<ScalarType> &) =
                        sequential_initializer) {
     test_stripe(src_w, src_h, dst_w, dst_h, 0, dst_h, transform, channels,
-                padding, initializer);
+                border_type, border_value, padding, initializer);
   }
 
   static void test_special_source(
       size_t src_w, size_t src_h, size_t src_stride, size_t dst_w, size_t dst_h,
       const float transform[9], size_t channels,
+      kleidicv_border_type_t border_type, const ScalarType *border_value,
       void (*initializer)(test::Array2D<ScalarType> &) =
           sequential_initializer) {
     size_t src_total_width = channels * src_w;
@@ -142,14 +177,15 @@ class WarpPerspectiveNearest : public testing::Test {
 
     actual.fill(42);
 
-    calculate_expected(source, 0, expected.height(), transform, expected);
+    calculate_expected(source, 0, expected.height(), transform, border_type,
+                       border_value, expected);
 
     ASSERT_EQ(KLEIDICV_OK,
               kleidicv_warp_perspective_u8(
                   source.data(), src_stride, src_w, src_h, actual.data(),
                   actual.stride(), actual.width(), actual.height(), transform,
-                  channels, KLEIDICV_INTERPOLATION_NEAREST,
-                  KLEIDICV_BORDER_TYPE_REPLICATE, {}));
+                  channels, KLEIDICV_INTERPOLATION_NEAREST, border_type,
+                  border_value));
 
     EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected);
   }
@@ -157,7 +193,14 @@ class WarpPerspectiveNearest : public testing::Test {
  private:
   static void calculate_expected(test::Array2D<ScalarType> &src, size_t y_begin,
                                  size_t y_end, const float transform[9],
+                                 kleidicv_border_type_t border_type,
+                                 const ScalarType *border_value,
                                  test::Array2D<ScalarType> &expected) {
+    auto get_src = [&](ptrdiff_t x, ptrdiff_t y) {
+      return get_array2d_element_or_border(src, x, y, border_type,
+                                           border_value);
+    };
+
     for (size_t y = y_begin; y < y_end; ++y) {
       for (size_t x = 0; x < expected.width() / src.channels(); ++x) {
         float dx = static_cast<float>(x), dy = static_cast<float>(y);
@@ -168,13 +211,14 @@ class WarpPerspectiveNearest : public testing::Test {
             inv_w * (transform[0] * dx + transform[1] * dy + transform[2]);
         float fy =
             inv_w * (transform[3] * dx + transform[4] * dy + transform[5]);
-        ptrdiff_t ix = static_cast<ptrdiff_t>(
-            std::max<double>(0, std::min(fx + 0.5, src.width() - 1.0)));
-        ptrdiff_t iy = static_cast<ptrdiff_t>(
-            std::max<double>(0, std::min(fy + 0.5, src.height() - 1.0)));
+        ptrdiff_t ix = static_cast<ptrdiff_t>(std::max<double>(
+            INT_MIN,
+            std::min<double>(std::round(fx), KLEIDICV_MAX_IMAGE_PIXELS)));
+        ptrdiff_t iy = static_cast<ptrdiff_t>(std::max<double>(
+            INT_MIN,
+            std::min<double>(std::round(fy), KLEIDICV_MAX_IMAGE_PIXELS)));
         for (size_t ch = 0; ch < src.channels(); ++ch) {
-          *expected.at(y, x * src.channels() + ch) =
-              *src.at(iy, ix * src.channels() + ch);
+          *expected.at(y, x * src.channels() + ch) = get_src(ix, iy)[ch];
         }
       }
     }
@@ -189,7 +233,10 @@ TYPED_TEST(WarpPerspectiveNearest, IdentityNoPadding) {
   size_t src_h = 4;
   size_t dst_w = src_w;
   size_t dst_h = src_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, 0);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1,
+                      border_type, border_value, 0);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveNearest, TransposeNoPadding) {
@@ -197,7 +244,10 @@ TYPED_TEST(WarpPerspectiveNearest, TransposeNoPadding) {
   size_t src_h = 4;
   size_t dst_w = src_w;
   size_t dst_h = src_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, 0);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1,
+                      border_type, border_value, 0);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveNearest, SmallPadding) {
@@ -205,7 +255,10 @@ TYPED_TEST(WarpPerspectiveNearest, SmallPadding) {
   size_t src_h = 4;
   size_t dst_w = src_w;
   size_t dst_h = src_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, 13);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1,
+                      border_type, border_value, 13);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveNearest, Upscale) {
@@ -221,7 +274,10 @@ TYPED_TEST(WarpPerspectiveNearest, Upscale) {
   size_t src_h = 4;
   size_t dst_w = src_w * 3;
   size_t dst_h = src_h * 2;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1,
+                      border_type, border_value, 3);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveNearest, RandomTransform) {
@@ -235,7 +291,10 @@ TYPED_TEST(WarpPerspectiveNearest, RandomTransform) {
   size_t src_h = 4;
   size_t dst_w = src_w;
   size_t dst_h = src_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, 19);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, border_type,
+                      border_value, 19);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveNearest, DivisionByZero) {
@@ -251,7 +310,10 @@ TYPED_TEST(WarpPerspectiveNearest, DivisionByZero) {
   size_t src_h = 4;
   size_t dst_w = src_w;
   size_t dst_h = src_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, 3);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1,
+                      border_type, border_value, 3);
+  }
 }
 
 static const size_t kBigWidth = 1ULL << 17, kBigHeight = 1ULL << 17;
@@ -270,8 +332,8 @@ static void part_initializer(test::Array2D<ScalarType> &source) {
 TYPED_TEST(WarpPerspectiveNearest, BigSourceImage) {
   // clang-format off
   const float transform_cut[] = {
-    1, 0, kBigWidth,
-    0, 1, kBigHeight,
+    1, 0, kBigWidth + kPartWidth * 0.5F,
+    0, 1, kBigHeight + kPartHeight * 0.5F,
     0, 0, 1
   };
   // clang-format on
@@ -281,9 +343,10 @@ TYPED_TEST(WarpPerspectiveNearest, BigSourceImage) {
   size_t dst_h = kPartHeight;
   size_t src_w = kBigWidth + kPartWidth;
   size_t src_h = kBigHeight + kPartHeight;
-
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, 0,
-                    part_initializer);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, border_type,
+                      border_value, 0, part_initializer);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveNearest, BigWidthDestination) {
@@ -299,7 +362,10 @@ TYPED_TEST(WarpPerspectiveNearest, BigWidthDestination) {
   size_t dst_h = 1;
   size_t src_w = dst_w / 1000;
   size_t src_h = dst_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1,
+                      border_type, border_value, 3);
+  }
 }
 
 static const size_t kHugeHeight = (1ULL << 24) - 1;
@@ -316,8 +382,11 @@ TYPED_TEST(WarpPerspectiveNearest, BigHeightDestination) {
   size_t dst_h = kHugeHeight;
   size_t src_w = dst_w;
   size_t src_h = dst_h / 10000;
-  TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight,
-                           dst_h, transform_upscale, 1, 0);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight,
+                             dst_h, transform_upscale, 1, border_type,
+                             border_value, 0);
+  }
 }
 
 template <class ScalarType>
@@ -343,9 +412,11 @@ TYPED_TEST(WarpPerspectiveNearest, HugeHeightSourceAndDestination) {
   size_t dst_h = (1ULL << 24) - 1;
   size_t src_w = dst_w;
   size_t src_h = dst_h;
-  TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - 16, dst_h,
-                           transform, 1, 0,
-                           huge_height_part_initializer<TypeParam>);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - 16, dst_h,
+                             transform, 1, border_type, border_value, 0,
+                             huge_height_part_initializer<TypeParam>);
+  }
 }
 
 static const size_t oneline_big_width = 1ULL << 23;
@@ -375,11 +446,15 @@ TYPED_TEST(WarpPerspectiveNearest, OneLineBigSourceImage) {
   size_t src_w = oneline_big_width + oneline_part_width;
   size_t src_h = 1;
 
-  TestFixture::test_special_source(src_w, src_h, 0, dst_w, dst_h, tr, 1,
-                                   oneline_part_initializer);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_special_source(src_w, src_h, 0, dst_w, dst_h, tr, 1,
+                                     border_type, border_value,
+                                     oneline_part_initializer);
 
-  TestFixture::test_special_source(src_w, src_h, (1ULL << 32) - 1, dst_w, dst_h,
-                                   tr, 1, oneline_part_initializer);
+    TestFixture::test_special_source(src_w, src_h, (1ULL << 32) - 1, dst_w,
+                                     dst_h, tr, 1, border_type, border_value,
+                                     oneline_part_initializer);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveNearest, OneLineSmallSourceImage) {
@@ -396,15 +471,19 @@ TYPED_TEST(WarpPerspectiveNearest, OneLineSmallSourceImage) {
   size_t src_w = 2;
   size_t src_h = 1;
 
-  TestFixture::test_special_source(src_w, src_h, src_h, dst_w, dst_h, tr, 1);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_special_source(src_w, src_h, src_h, dst_w, dst_h, tr, 1,
+                                     border_type, border_value);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveNearest, NullPointer) {
   const TypeParam src[4] = {};
+  const TypeParam border_value[KLEIDICV_MAXIMUM_CHANNEL_COUNT] = {};
   TypeParam dst[8];
   test::test_null_args(kleidicv_warp_perspective_u8, src, 2, 2, 2, dst, 8, 8, 1,
                        transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST,
-                       KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
+                       KLEIDICV_BORDER_TYPE_CONSTANT, border_value);
 }
 
 TYPED_TEST(WarpPerspectiveNearest, ZeroImageSize) {
@@ -465,15 +544,16 @@ TYPED_TEST(WarpPerspectiveNearest, UnsupportedTwoChannels) {
                 nullptr));
 }
 
-TYPED_TEST(WarpPerspectiveNearest, UnsupportedBorderTypeConst) {
+TYPED_TEST(WarpPerspectiveNearest, UnsupportedBorderType) {
   const TypeParam src[1] = {};
+  const TypeParam border_value[KLEIDICV_MAXIMUM_CHANNEL_COUNT] = {};
   TypeParam dst[8];
 
   EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
             kleidicv_warp_perspective_u8(
                 src, 1, 1, 1, dst, 8, 8, 1, transform_identity, 1,
-                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_CONSTANT,
-                nullptr));
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REFLECT,
+                border_value));
 }
 
 TYPED_TEST(WarpPerspectiveNearest, UnsupportedTooSmallImage) {
@@ -552,7 +632,8 @@ class WarpPerspectiveLinear : public testing::Test {
   static void test_stripe(size_t src_w, size_t src_h, size_t dst_w,
                           size_t dst_h, size_t y_begin, size_t y_end,
                           const float transform[9], size_t channels,
-                          size_t padding,
+                          kleidicv_border_type_t border_type,
+                          const ScalarType *border_value, size_t padding,
                           void (*initializer)(test::Array2D<ScalarType> &) =
                               sequential_initializer) {
     size_t src_total_width = channels * src_w;
@@ -570,7 +651,8 @@ class WarpPerspectiveLinear : public testing::Test {
       }
     }
 
-    calculate_expected(source, y_begin, y_end, transform, expected);
+    calculate_expected(source, y_begin, y_end, transform, border_type,
+                       border_value, expected);
 
     ASSERT_EQ(
         KLEIDICV_OK,
@@ -578,7 +660,7 @@ class WarpPerspectiveLinear : public testing::Test {
             source.data(), source.stride(), source.width(), source.height(),
             actual.data(), actual.stride(), actual.width(), actual.height(),
             y_begin, y_end, transform, channels, KLEIDICV_INTERPOLATION_LINEAR,
-            KLEIDICV_BORDER_TYPE_REPLICATE, {}));
+            border_type, border_value));
 
     ScalarType threshold = 1;
     for (size_t row = y_begin; row < y_end; ++row) {
@@ -607,17 +689,26 @@ class WarpPerspectiveLinear : public testing::Test {
   }
 
   static void test(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h,
-                   const float transform[9], size_t channels, size_t padding,
+                   const float transform[9], size_t channels,
+                   kleidicv_border_type_t border_type,
+                   const ScalarType *border_value, size_t padding,
                    void (*initializer)(test::Array2D<ScalarType> &) =
                        sequential_initializer) {
     test_stripe(src_w, src_h, dst_w, dst_h, 0, dst_h, transform, channels,
-                padding, initializer);
+                border_type, border_value, padding, initializer);
   }
 
  private:
   static void calculate_expected(test::Array2D<ScalarType> &src, size_t y_begin,
                                  size_t y_end, const float transform[9],
+                                 kleidicv_border_type_t border_type,
+                                 const ScalarType *border_value,
                                  test::Array2D<ScalarType> &expected) {
+    auto get_src = [&](ptrdiff_t x, ptrdiff_t y) {
+      return get_array2d_element_or_border(src, x, y, border_type,
+                                           border_value);
+    };
+
     for (size_t y = y_begin; y < y_end; ++y) {
       for (size_t x = 0; x < expected.width() / src.channels(); ++x) {
         double dx = static_cast<double>(x), dy = static_cast<double>(y);
@@ -627,21 +718,19 @@ class WarpPerspectiveLinear : public testing::Test {
         double ty = transform[3] * dx + transform[4] * dy + transform[5];
         double fx = inv_w * tx;
         double fy = inv_w * ty;
-        uint32_t ix0 = static_cast<uint32_t>(
-            std::max(0.0, std::min(fx, src.width() - 1.0)));
-        uint32_t iy0 = static_cast<uint32_t>(
-            std::max(0.0, std::min(fy, src.height() - 1.0)));
-        uint32_t ix1 = (ix0 < src.width() - 1 && fx >= 0) ? ix0 + 1 : ix0;
-        uint32_t iy1 = (iy0 < src.height() - 1 && fy >= 0) ? iy0 + 1 : iy0;
-        double xfrac =
-            (ix0 < src.width() - 1 && fx >= 0) ? fx - floor(fx) : 0.0;
-        double yfrac =
-            (iy0 < src.height() - 1 && fy >= 0) ? fy - floor(fy) : 0.0;
+        ptrdiff_t ix0 = static_cast<ptrdiff_t>(std::max<double>(
+            INT_MIN, std::min<double>(floor(fx), KLEIDICV_MAX_IMAGE_PIXELS)));
+        ptrdiff_t iy0 = static_cast<ptrdiff_t>(std::max<double>(
+            INT_MIN, std::min<double>(floor(fy), KLEIDICV_MAX_IMAGE_PIXELS)));
+        ptrdiff_t ix1 = ix0 + 1;
+        ptrdiff_t iy1 = iy0 + 1;
+        double xfrac = std::isfinite(fx) ? fx - floor(fx) : 0.0;
+        double yfrac = std::isfinite(fy) ? fy - floor(fy) : 0.0;
         for (size_t ch = 0; ch < src.channels(); ++ch) {
-          double a = *src.at(iy0, ix0 * src.channels() + ch);
-          double b = *src.at(iy0, ix1 * src.channels() + ch);
-          double c = *src.at(iy1, ix0 * src.channels() + ch);
-          double d = *src.at(iy1, ix1 * src.channels() + ch);
+          double a = get_src(ix0, iy0)[ch];
+          double b = get_src(ix1, iy0)[ch];
+          double c = get_src(ix0, iy1)[ch];
+          double d = get_src(ix1, iy1)[ch];
           double line1 = (b - a) * xfrac + a;
           double line2 = (d - c) * xfrac + c;
           double double_result = (line2 - line1) * yfrac + line1;
@@ -704,7 +793,10 @@ TYPED_TEST(WarpPerspectiveLinear, IdentityNoPadding) {
   size_t src_h = 4;
   size_t dst_w = src_w;
   size_t dst_h = src_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, 0);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1,
+                      border_type, border_value, 0);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveLinear, TransposeNoPadding) {
@@ -712,7 +804,10 @@ TYPED_TEST(WarpPerspectiveLinear, TransposeNoPadding) {
   size_t src_h = 4;
   size_t dst_w = src_w;
   size_t dst_h = src_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, 0);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1,
+                      border_type, border_value, 0);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveLinear, SmallPadding) {
@@ -720,7 +815,10 @@ TYPED_TEST(WarpPerspectiveLinear, SmallPadding) {
   size_t src_h = 4;
   size_t dst_w = src_w;
   size_t dst_h = src_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, 13);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1,
+                      border_type, border_value, 13);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveLinear, Upscale) {
@@ -736,7 +834,10 @@ TYPED_TEST(WarpPerspectiveLinear, Upscale) {
   size_t src_h = 4;
   size_t dst_w = src_w * 3;
   size_t dst_h = src_h * 2;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1,
+                      border_type, border_value, 3);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveLinear, RandomTransform) {
@@ -758,7 +859,10 @@ TYPED_TEST(WarpPerspectiveLinear, RandomTransform) {
     size_t src_h = 4;
     size_t dst_w = src_w;
     size_t dst_h = src_h;
-    TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, 19);
+    for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+      TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, border_type,
+                        border_value, 19);
+    }
   }
 }
 
@@ -775,7 +879,10 @@ TYPED_TEST(WarpPerspectiveLinear, DivisionByZero) {
   size_t src_h = 4;
   size_t dst_w = src_w;
   size_t dst_h = src_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, 3);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1,
+                      border_type, border_value, 3);
+  }
 }
 
 template <class ScalarType>
@@ -801,8 +908,10 @@ TYPED_TEST(WarpPerspectiveLinear, RoughSource) {
   size_t src_h = 634;
   size_t dst_w = 17;
   size_t dst_h = 852;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transformation, 1, 13,
-                    crisscross_initializer);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transformation, 1,
+                      border_type, border_value, 13, crisscross_initializer);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveLinear, BigSourceImage) {
@@ -820,8 +929,10 @@ TYPED_TEST(WarpPerspectiveLinear, BigSourceImage) {
   size_t src_w = kBigWidth + kPartWidth + 3;
   size_t src_h = kBigHeight + kPartHeight;
 
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, 0,
-                    part_initializer);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, border_type,
+                      border_value, 0, part_initializer);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveLinear, BigWidthDestination) {
@@ -837,7 +948,10 @@ TYPED_TEST(WarpPerspectiveLinear, BigWidthDestination) {
   size_t dst_h = 1;
   size_t src_w = dst_w / 1000;
   size_t src_h = dst_h;
-  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1,
+                      border_type, border_value, 3);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveLinear, BigHeightDestination) {
@@ -853,8 +967,11 @@ TYPED_TEST(WarpPerspectiveLinear, BigHeightDestination) {
   size_t dst_h = kHugeHeight;
   size_t src_w = dst_w;
   size_t src_h = dst_h / 10000;
-  TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight,
-                           dst_h, transform_upscale, 1, 0);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight,
+                             dst_h, transform_upscale, 1, border_type,
+                             border_value, 0);
+  }
 }
 
 TYPED_TEST(WarpPerspectiveLinear, HugeHeightSourceAndDestination) {
@@ -870,7 +987,9 @@ TYPED_TEST(WarpPerspectiveLinear, HugeHeightSourceAndDestination) {
   size_t dst_h = (1ULL << 24) - 1;
   size_t src_w = dst_w;
   size_t src_h = dst_h;
-  TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight,
-                           dst_h, transform, 1, 0,
-                           huge_height_part_initializer<TypeParam>);
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight,
+                             dst_h, transform, 1, border_type, border_value, 0,
+                             huge_height_part_initializer<TypeParam>);
+  }
 }
-- 
GitLab


From dba6435748eceff28a78e7855748a399c187b5e9 Mon Sep 17 00:00:00 2001
From: Michael Platings <michael.platings@arm.com>
Date: Tue, 14 Jan 2025 15:50:05 +0000
Subject: [PATCH 2/2] Refactor warp_perspective_sc.h

The use of mutable variables across many lambdas had the effect of
action-at-a-distance [1].

[1] https://en.wikipedia.org/wiki/Action_at_a_distance_(computer_programming)
---
 kleidicv/src/transform/warp_perspective_sc.h | 217 ++++++++-----------
 1 file changed, 96 insertions(+), 121 deletions(-)

diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h
index 519d544dd..20cda8d79 100644
--- a/kleidicv/src/transform/warp_perspective_sc.h
+++ b/kleidicv/src/transform/warp_perspective_sc.h
@@ -47,7 +47,6 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
                                 const ScalarType *border_value,
                                 Rows<ScalarType> dst_rows, size_t dst_width,
                                 size_t y_begin, size_t y_end) {
-  svbool_t pg_all64 = svptrue_b64();
   svbool_t pg_all32 = svptrue_b32();
   svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1));
   svuint32_t sv_xmax = svdup_n_u32(src_width - 1);
@@ -65,13 +64,8 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
   svfloat32_t T3 = svdup_n_f32(transform[3]);
   svfloat32_t T6 = svdup_n_f32(transform[6]);
   svfloat32_t tx0, ty0, tw0;
-  svfloat32_t xf, yf;
   svfloat32_t xmaxf = svdup_n_f32(static_cast<float>(src_width - 1));
   svfloat32_t ymaxf = svdup_n_f32(static_cast<float>(src_height - 1));
-  svuint32_t xi, yi;
-
-  svbool_t pg32 = pg_all32;
-  svbool_t pg64_b = pg_all64, pg64_t = pg_all64;
 
   const size_t kStep = VecTraits<float>::num_lanes();
 
@@ -86,13 +80,16 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3);
 
     // Calculate coordinates into the source image
-    xf = svmul_f32_x(pg_all32, tx, iw);
-    yf = svmul_f32_x(pg_all32, ty, iw);
+    return svcreate2(svmul_f32_x(pg_all32, tx, iw),
+                     svmul_f32_x(pg_all32, ty, iw));
   };
 
   auto calculate_nearest_coordinates = [&](size_t x) {
-    calculate_coordinates(x);
+    svfloat32x2_t coords = calculate_coordinates(x);
+    svfloat32_t xf = svget2(coords, 0);
+    svfloat32_t yf = svget2(coords, 1);
 
+    svuint32_t xi, yi;
     if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
       // Round to the nearest integer
       xi = svreinterpret_u32_s32(
@@ -109,69 +106,59 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
                    svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)),
                    sv_ymax);
     }
+    return svcreate2(xi, yi);
   };
 
-  auto load_small = [&](svbool_t pg, svuint32_t x, svuint32_t y) {
-    svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
-    return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets);
-  };
-
-  auto load_large = [&](svbool_t pg_b, svbool_t pg_t, svuint32_t x,
-                        svuint32_t y) {
-    // Calculate offsets from coordinates (y * stride + x)
-    // To avoid losing precision, the final offsets should be in 64 bits
-    svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride);
-    svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride);
-    // Copy pixels from source
-    svuint64_t result_b =
-        svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
-    svuint64_t result_t =
-        svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
-    return svtrn1_u32(svreinterpret_u32_u64(result_b),
-                      svreinterpret_u32_u64(result_t));
-  };
-
-  auto get_pixels_or_border = [&](svuint32_t x, svuint32_t y) {
-    svbool_t in_range = svand_b_z(pg32, svcmple_u32(pg32, x, sv_xmax),
-                                  svcmple_u32(pg32, y, sv_ymax));
-
-    svuint32_t result;
+  auto load = [&](svbool_t pg, svuint32_t x, svuint32_t y) {
     if constexpr (IsLarge) {
-      svbool_t pg_b = in_range;
-      svbool_t pg_t = svtrn2_b32(in_range, svpfalse());
-      result = load_large(pg_b, pg_t, x, y);
+      svbool_t pg_b = pg;
+      svbool_t pg_t = svtrn2_b32(pg, svpfalse());
+
+      // Calculate offsets from coordinates (y * stride + x)
+      // To avoid losing precision, the final offsets should be in 64 bits
+      svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride);
+      svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride);
+      // Copy pixels from source
+      svuint64_t result_b =
+          svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
+      svuint64_t result_t =
+          svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
+      return svtrn1_u32(svreinterpret_u32_u64(result_b),
+                        svreinterpret_u32_u64(result_t));
     } else {
-      result = load_small(in_range, x, y);
+      svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
+      return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets);
     }
+  };
 
+  auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) {
+    svbool_t in_range =
+        svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
+    svuint32_t result = load(in_range, x, y);
     // Select between source pixels and border colour
     return svsel_u32(in_range, result, sv_border);
   };
 
   auto vector_path_nearest_4x = [&](size_t x, Columns<ScalarType> dst) {
-    auto load_source = [&](svuint32_t x, svuint32_t y) {
+    auto load_source = [&](svuint32x2_t coords) {
+      svuint32_t x = svget2(coords, 0);
+      svuint32_t y = svget2(coords, 1);
       if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
-        return get_pixels_or_border(x, y);
-      } else if constexpr (IsLarge) {
-        return load_large(pg_all64, pg_all64, x, y);
+        return get_pixels_or_border(pg_all32, x, y);
       } else {
-        return load_small(pg_all32, x, y);
+        return load(pg_all32, x, y);
       }
     };
     ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
-    calculate_nearest_coordinates(x);
-    svuint32_t res32_0 = load_source(xi, yi);
+    svuint32_t res32_0 = load_source(calculate_nearest_coordinates(x));
     x += kStep;
-    calculate_nearest_coordinates(x);
-    svuint32_t res32_1 = load_source(xi, yi);
+    svuint32_t res32_1 = load_source(calculate_nearest_coordinates(x));
     svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
                                     svreinterpret_u16_u32(res32_1));
     x += kStep;
-    calculate_nearest_coordinates(x);
-    res32_0 = load_source(xi, yi);
+    res32_0 = load_source(calculate_nearest_coordinates(x));
     x += kStep;
-    calculate_nearest_coordinates(x);
-    res32_1 = load_source(xi, yi);
+    res32_1 = load_source(calculate_nearest_coordinates(x));
     svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
                                     svreinterpret_u16_u32(res32_1));
     svuint8_t result =
@@ -182,31 +169,29 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
   auto vector_path_nearest_tail = [&](size_t x, size_t x_max,
                                       Columns<ScalarType> dst) {
     size_t length = x_max - x;
-    pg64_b = svwhilelt_b64(0ULL, (length + 1) / 2);
-    pg64_t = svwhilelt_b64(0ULL, length / 2);
-    pg32 = svwhilelt_b32(0ULL, length);
-    calculate_nearest_coordinates(x);
-    svuint32_t result;
+    svbool_t pg32 = svwhilelt_b32(0ULL, length);
 
+    svuint32x2_t coords = calculate_nearest_coordinates(x);
+    svuint32_t xi = svget2(coords, 0);
+    svuint32_t yi = svget2(coords, 1);
+
+    svuint32_t result;
     if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
-      result = get_pixels_or_border(xi, yi);
+      result = get_pixels_or_border(pg32, xi, yi);
     } else {
-      // To avoid losing precision, the final indices use 64 bits
-      result = load_large(pg64_b, pg64_t, xi, yi);
+      result = load(pg32, xi, yi);
     }
     svst1b_u32(pg32, &dst[static_cast<ptrdiff_t>(x)], result);
   };
 
-  auto calculate_linear_replicate = [&](uint32_t x) {
+  auto calculate_linear_replicate = [&](svbool_t pg, uint32_t x) {
     auto load_source = [&](svuint32_t x, svuint32_t y) {
-      if constexpr (IsLarge) {
-        return load_large(pg64_b, pg64_t, x, y);
-      } else {
-        return load_small(pg32, x, y);
-      }
+      return load(pg, x, y);
     };
 
-    calculate_coordinates(x);
+    svfloat32x2_t coords = calculate_coordinates(x);
+    svfloat32_t xf = svget2(coords, 0);
+    svfloat32_t yf = svget2(coords, 1);
     // Take the integer part, clamp it to within the dimensions of the
     // source image (negative values are already saturated to 0)
     svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf));
@@ -245,56 +230,54 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
         svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)));
   };
 
-  auto calculate_linear_constant_border = [&](uint32_t x) {
-    calculate_coordinates(x);
+  auto calculate_linear_constant_border = [&](svbool_t pg, uint32_t x) {
+    svfloat32x2_t coords = calculate_coordinates(x);
+    svfloat32_t xf = svget2(coords, 0);
+    svfloat32_t yf = svget2(coords, 1);
 
     // Convert obviously out-of-range coordinates to values that are just beyond
     // the largest permitted image width & height. This avoids the need for
     // special case handling elsewhere.
     svfloat32_t big = svdup_n_f32(1 << 24);
-    xf = svsel_f32(svcmple_f32(pg_all32, svabs_f32_x(pg_all32, xf), big), xf,
-                   big);
-    yf = svsel_f32(svcmple_f32(pg_all32, svabs_f32_x(pg_all32, yf), big), yf,
-                   big);
-
-    svfloat32_t xf0 = svrintm_f32_x(pg_all32, xf);
-    svfloat32_t yf0 = svrintm_f32_x(pg_all32, yf);
-
-    svint32_t x0 = svcvt_s32_x(pg_all32, xf0);
-    svint32_t y0 = svcvt_s32_x(pg_all32, yf0);
-    svint32_t x1 = svadd_s32_x(pg_all32, x0, svdup_n_s32(1));
-    svint32_t y1 = svadd_s32_x(pg_all32, y0, svdup_n_s32(1));
-
-    svfloat32_t xfrac = svsub_f32_x(pg_all32, xf, xf0);
-    svfloat32_t yfrac = svsub_f32_x(pg_all32, yf, yf0);
-
-    svfloat32_t a = svcvt_f32_u32_x(
-        pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x0),
-                                       svreinterpret_u32_s32(y0)));
-    svfloat32_t b = svcvt_f32_u32_x(
-        pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x1),
-                                       svreinterpret_u32_s32(y0)));
-    svfloat32_t line0 =
-        svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
-    svfloat32_t c = svcvt_f32_u32_x(
-        pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x0),
-                                       svreinterpret_u32_s32(y1)));
-    svfloat32_t d = svcvt_f32_u32_x(
-        pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x1),
-                                       svreinterpret_u32_s32(y1)));
-    svfloat32_t line1 =
-        svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
-    svfloat32_t result = svmla_f32_x(
-        pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
-    return svcvt_u32_f32_x(pg_all32, svrinta_f32_x(pg_all32, result));
+    xf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, xf), big), xf, big);
+    yf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, yf), big), yf, big);
+
+    svfloat32_t xf0 = svrintm_f32_x(pg, xf);
+    svfloat32_t yf0 = svrintm_f32_x(pg, yf);
+
+    svint32_t x0 = svcvt_s32_x(pg, xf0);
+    svint32_t y0 = svcvt_s32_x(pg, yf0);
+    svint32_t x1 = svadd_s32_x(pg, x0, svdup_n_s32(1));
+    svint32_t y1 = svadd_s32_x(pg, y0, svdup_n_s32(1));
+
+    svfloat32_t xfrac = svsub_f32_x(pg, xf, xf0);
+    svfloat32_t yfrac = svsub_f32_x(pg, yf, yf0);
+
+    svfloat32_t a =
+        svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x0),
+                                                 svreinterpret_u32_s32(y0)));
+    svfloat32_t b =
+        svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x1),
+                                                 svreinterpret_u32_s32(y0)));
+    svfloat32_t line0 = svmla_f32_x(pg, a, svsub_f32_x(pg, b, a), xfrac);
+    svfloat32_t c =
+        svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x0),
+                                                 svreinterpret_u32_s32(y1)));
+    svfloat32_t d =
+        svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x1),
+                                                 svreinterpret_u32_s32(y1)));
+    svfloat32_t line1 = svmla_f32_x(pg, c, svsub_f32_x(pg, d, c), xfrac);
+    svfloat32_t result =
+        svmla_f32_x(pg, line0, svsub_f32_x(pg, line1, line0), yfrac);
+    return svcvt_u32_f32_x(pg, svrinta_f32_x(pg, result));
   };
 
-  auto calculate_linear = [&](uint32_t x) {
+  auto calculate_linear = [&](svbool_t pg, uint32_t x) {
     if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
-      return calculate_linear_replicate(x);
+      return calculate_linear_replicate(pg, x);
     } else {
       static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
-      return calculate_linear_constant_border(x);
+      return calculate_linear_constant_border(pg, x);
     }
   };
 
@@ -308,9 +291,6 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5]));
     tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8]));
 
-    pg32 = pg_all32;
-    pg64_b = pg64_t = pg_all64;
-
     LoopUnroll2 loop{dst_width, kStep};
     if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
       loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); });
@@ -322,15 +302,15 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) {
       loop.unroll_four_times([&](size_t x) {
         ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
-        svuint32_t res0 = calculate_linear(x);
+        svuint32_t res0 = calculate_linear(pg_all32, x);
         x += kStep;
-        svuint32_t res1 = calculate_linear(x);
+        svuint32_t res1 = calculate_linear(pg_all32, x);
         svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0),
                                            svreinterpret_u16_u32(res1));
         x += kStep;
-        res0 = calculate_linear(x);
+        res0 = calculate_linear(pg_all32, x);
         x += kStep;
-        res1 = calculate_linear(x);
+        res1 = calculate_linear(pg_all32, x);
         svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0),
                                            svreinterpret_u16_u32(res1));
         svst1_u8(svptrue_b8(), p_dst,
@@ -339,18 +319,13 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
       });
       loop.unroll_once([&](size_t x) {
         ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
-        svuint32_t result = calculate_linear(x);
+        svuint32_t result = calculate_linear(pg_all32, x);
         svst1b_u32(pg_all32, p_dst, result);
       });
       loop.remaining([&](size_t x, size_t x_max) {
         ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
-        size_t length = x_max - x;
-        pg32 = svwhilelt_b32(0ULL, length);
-        if constexpr (IsLarge) {
-          pg64_b = svwhilelt_b64(0ULL, (length + 1) / 2);
-          pg64_t = svwhilelt_b64(0ULL, length / 2);
-        }
-        svuint32_t result = calculate_linear(x);
+        svbool_t pg32 = svwhilelt_b32(x, x_max);
+        svuint32_t result = calculate_linear(pg32, x);
         svst1b_u32(pg32, p_dst, result);
       });
     } else {
-- 
GitLab