From 1629fdf91e7a7a971abd8db8b03376ff84846719 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Tue, 25 Feb 2025 15:53:05 +0000 Subject: [PATCH] Simplify, beautify and unify common parts of Remap_F32 and WarpPerspective Zero-size (width or height) source images are not supported anymore. Division by zero is now not replaced with 0, but is left NaN. --- kleidicv/include/kleidicv/kleidicv.h | 3 +- kleidicv/include/kleidicv/transform/remap.h | 14 - kleidicv/src/transform/remap_f32_neon.cpp | 1236 ++--------------- kleidicv/src/transform/remap_f32_sve2.cpp | 321 +++++ kleidicv/src/transform/remap_s16_sve2.cpp | 277 ++++ .../{remap_sc.h => remap_s16point5_sve2.cpp} | 568 +------- kleidicv/src/transform/remap_sve2.cpp | 84 -- kleidicv/src/transform/transform_common.h | 54 + kleidicv/src/transform/transform_neon.h | 256 ++++ .../{common_sc.h => transform_sve2.h} | 126 +- .../src/transform/warp_perspective_neon.cpp | 386 +---- kleidicv/src/transform/warp_perspective_sc.h | 299 ---- .../src/transform/warp_perspective_sve2.cpp | 270 +++- test/api/test_remap.cpp | 14 +- test/api/test_warp_perspective.cpp | 40 +- 15 files changed, 1426 insertions(+), 2522 deletions(-) create mode 100644 kleidicv/src/transform/remap_f32_sve2.cpp create mode 100644 kleidicv/src/transform/remap_s16_sve2.cpp rename kleidicv/src/transform/{remap_sc.h => remap_s16point5_sve2.cpp} (50%) delete mode 100644 kleidicv/src/transform/remap_sve2.cpp create mode 100644 kleidicv/src/transform/transform_common.h create mode 100644 kleidicv/src/transform/transform_neon.h rename kleidicv/src/transform/{common_sc.h => transform_sve2.h} (60%) delete mode 100644 kleidicv/src/transform/warp_perspective_sc.h diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 294414163..2bed6acfd 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1840,7 +1840,8 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src, /// Width and height are the same for `mapx`, `mapy` and for `dst`. `src` /// dimensions may be different, but due to the limits of 32-bit float format, /// its width and height must be less than 2^24. Coordinates outside of `src` -/// dimensions are considered border. +/// dimensions are considered border. Zero width or height `src` is not +/// supported. /// /// @param src Pointer to the source data. Must be non-null. 
/// @param src_stride Distance in bytes from the start of one row to the diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index dcf43bd64..15b1654b6 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -142,20 +142,6 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, const T *border_value); } // namespace sve2 -namespace sme2 { - -template -kleidicv_error_t remap_s16point5(const T *src, size_t src_stride, - size_t src_width, size_t src_height, T *dst, - size_t dst_stride, size_t dst_width, - size_t dst_height, size_t channels, - const int16_t *mapxy, size_t mapxy_stride, - const uint16_t *mapfrac, size_t mapfrac_stride, - kleidicv_border_type_t border_type, - const T *border_value); - -} // namespace sme2 - } // namespace kleidicv #endif // KLEIDICV_REMAP_REMAP_H diff --git a/kleidicv/src/transform/remap_f32_neon.cpp b/kleidicv/src/transform/remap_f32_neon.cpp index ccfec490c..18f517c14 100644 --- a/kleidicv/src/transform/remap_f32_neon.cpp +++ b/kleidicv/src/transform/remap_f32_neon.cpp @@ -2,1094 +2,151 @@ // // SPDX-License-Identifier: Apache-2.0 +#include + #include -#include -#include "kleidicv/kleidicv.h" +#include "kleidicv/ctypes.h" #include "kleidicv/neon.h" #include "kleidicv/transform/remap.h" +#include "transform_neon.h" namespace kleidicv::neon { template -class RemapF32Replicate; - -template -class RemapF32Replicate { - public: - using ScalarType = uint8_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32Replicate(Rows src_rows, size_t src_width, - size_t src_height) - : src_rows_{src_rows}, - v_src_stride_{vdup_n_u32(static_cast(src_rows_.stride()))}, - vq_src_stride_{vdupq_n_u32(static_cast(src_rows_.stride()))}, - v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { - uint32x4_t offset = vmlaq_u32(x, y, vq_src_stride_); - uint64_t acc = - static_cast(src_rows_[vgetq_lane_u32(offset, 0)]) | - (static_cast(src_rows_[vgetq_lane_u32(offset, 1)]) << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows_[vgetq_lane_u32(offset, 2)]) | - (static_cast(src_rows_[vgetq_lane_u32(offset, 3)]) << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) { - uint64x2_t offset_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride_); - uint64x2_t offset_high = - vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride_); - uint64_t acc = - static_cast(src_rows_[vgetq_lane_u64(offset_low, 0)]) | - (static_cast(src_rows_[vgetq_lane_u64(offset_low, 1)]) - << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows_[vgetq_lane_u64(offset_high, 0)]) | - (static_cast(src_rows_[vgetq_lane_u64(offset_high, 1)]) - << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load = [&](uint32x4_t x, uint32x4_t y) { - if constexpr (IsLarge) { - return load_src_into_floats_large(x, y); - } else { - return load_src_into_floats_small(x, y); - } - }; - - auto vector_path_1 = [&](const float *ptr_mapx, const 
float *ptr_mapy) { - MapVectorType x = vld1q_f32(ptr_mapx); - MapVectorType y = vld1q_f32(ptr_mapy); - // Truncating convert to int - uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(x), v_xmax_); - uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(y), v_ymax_); - - // Get fractional part, or 0 if out of range - float32x4_t zero = vdupq_n_f32(0.F); - uint32x4_t x_in_range = - vandq_u32(vcgeq_f32(x, zero), vcltq_u32(x0, v_xmax_)); - uint32x4_t y_in_range = - vandq_u32(vcgeq_f32(y, zero), vcltq_u32(y0, v_ymax_)); - float32x4_t xfrac = - vbslq_f32(x_in_range, vsubq_f32(x, vrndmq_f32(x)), zero); - float32x4_t yfrac = - vbslq_f32(y_in_range, vsubq_f32(y, vrndmq_f32(y)), zero); - - // x1 = x0 + 1, except if it's already xmax or out of range - uint32x4_t x1 = vsubq_u32(x0, x_in_range); - uint32x4_t y1 = vsubq_u32(y0, y_in_range); - - // Calculate offsets from coordinates (y * stride + x) - // a: top left, b: top right, c: bottom left, d: bottom right - float32x4_t a = load(x0, y0); - float32x4_t b = load(x1, y0); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = load(x0, y1); - float32x4_t d = load(x1, y1); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vminq_u32(vdupq_n_u32(0xFF), vcvtaq_u32_f32(result)); - }; - - auto vector_path_4 = [&](size_t step) { // step = 4*4 = 16 - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); - uint16x8_t result16_0 = vuzp1q_u16(res0, res1); - - ptr_mapx += kStep; - ptr_mapy += kStep; - res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - res1 = vector_path_1(ptr_mapx, ptr_mapy); - uint16x8_t result16_1 = vuzp1q_u16(res0, res1); - vst1q_u8(&dst[0], vuzp1q_u8(result16_0, result16_1)); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_four_times(vector_path_4); - loop.unroll_once([&](size_t step) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - dst[0] = vgetq_lane_u32(result, 0); - dst[1] = vgetq_lane_u32(result, 1); - dst[2] = vgetq_lane_u32(result, 2); - dst[3] = vgetq_lane_u32(result, 3); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - dst[0] = vgetq_lane_u32(result, 0); - dst[1] = vgetq_lane_u32(result, 1); - dst[2] = vgetq_lane_u32(result, 2); - dst[3] = vgetq_lane_u32(result, 3); - }); - } - - private: - Rows src_rows_; - uint32x2_t v_src_stride_; // load_large - uint32x4_t vq_src_stride_; // load_small - uint32x4_t v_xmax_; - uint32x4_t v_ymax_; -}; // end of class RemapF32Replicate - -template -class RemapF32Replicate { - public: - using ScalarType = uint16_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32Replicate(Rows src_rows, size_t src_width, - size_t src_height) - : src_rows_{src_rows}, - v_src_element_stride_{vdup_n_u32( - 
static_cast(src_rows_.stride() / sizeof(ScalarType)))}, - vq_src_element_stride_{vdupq_n_u32( - static_cast(src_rows_.stride() / sizeof(ScalarType)))}, - v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { - uint32x4_t offset = vmlaq_u32(x, y, vq_src_element_stride_); - uint64_t acc = - static_cast(src_rows_[vgetq_lane_u32(offset, 0)]) | - (static_cast(src_rows_[vgetq_lane_u32(offset, 1)]) << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows_[vgetq_lane_u32(offset, 2)]) | - (static_cast(src_rows_[vgetq_lane_u32(offset, 3)]) << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) { - uint64x2_t offset_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), - vget_low_u32(y), v_src_element_stride_); - uint64x2_t offset_high = - vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_element_stride_); - uint64_t acc = - static_cast(src_rows_[vgetq_lane_u64(offset_low, 0)]) | - (static_cast(src_rows_[vgetq_lane_u64(offset_low, 1)]) - << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows_[vgetq_lane_u64(offset_high, 0)]) | - (static_cast(src_rows_[vgetq_lane_u64(offset_high, 1)]) - << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load = [&](uint32x4_t x, uint32x4_t y) { - if constexpr (IsLarge) { - return load_src_into_floats_large(x, y); - } else { - return load_src_into_floats_small(x, y); - } - }; - - auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { - MapVectorType x = vld1q_f32(ptr_mapx); - MapVectorType y = vld1q_f32(ptr_mapy); - // Truncating convert to int - uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(x), v_xmax_); - uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(y), v_ymax_); - - // Get fractional part, or 0 if out of range - float32x4_t zero = vdupq_n_f32(0.F); - uint32x4_t x_in_range = - vandq_u32(vcgeq_f32(x, zero), vcltq_u32(x0, v_xmax_)); - uint32x4_t y_in_range = - vandq_u32(vcgeq_f32(y, zero), vcltq_u32(y0, v_ymax_)); - float32x4_t xfrac = - vbslq_f32(x_in_range, vsubq_f32(x, vrndmq_f32(x)), zero); - float32x4_t yfrac = - vbslq_f32(y_in_range, vsubq_f32(y, vrndmq_f32(y)), zero); - - // x1 = x0 + 1, except if it's already xmax or out of range - uint32x4_t x1 = vsubq_u32(x0, x_in_range); - uint32x4_t y1 = vsubq_u32(y0, y_in_range); - - // Calculate offsets from coordinates (y * stride + x) - // a: top left, b: top right, c: bottom left, d: bottom right - float32x4_t a = load(x0, y0); - float32x4_t b = load(x1, y0); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = load(x0, y1); - float32x4_t d = load(x1, y1); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vminq_u32(vdupq_n_u32(0xFFFF), vcvtaq_u32_f32(result)); - }; - - auto vector_path_2 = [&](size_t step) { // step = 2*4 = 8 - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); - uint16x8_t result16 = vuzp1q_u16(res0, res1); - - vst1q_u16(&dst[0], 
result16); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_twice(vector_path_2); - loop.unroll_once([&](size_t step) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[0], result16); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[0], result16); - }); - } - - private: - Rows src_rows_; - uint32x2_t v_src_element_stride_; // load_large - uint32x4_t vq_src_element_stride_; // load_small - uint32x4_t v_xmax_; - uint32x4_t v_ymax_; -}; // end of class RemapF32Replicate +void remap_f32_nearest_replicate( + uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, + Rows src_rows, Columns dst, size_t dst_width, + Columns mapx, Columns mapy, const size_t kStep) { + LoopUnroll2 loop{dst_width, kStep}; + loop.unroll_once([&](size_t x) { + transform_pixels_replicate( + vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x]), v_xmax, v_ymax, v_src_stride, + src_rows, dst.at(x)); + }); +} template -class RemapF32ConstantBorder; - -// TODO: Need to refactor to reduce the complexity -// NOLINTBEGIN(readability-function-cognitive-complexity) -template -class RemapF32ConstantBorder { - public: - using ScalarType = uint8_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32ConstantBorder(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value) - : src_rows_{src_rows}, - src_width_{src_width}, - src_height_{src_height}, - border_value_{border_value} {} - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto get_edge_pixels = - [&](unsigned &a_result, unsigned &b_result, unsigned &c_result, - unsigned &d_result, int x0, int y0, ptrdiff_t offset, - Rows src_rows, int src_width, int src_height) { - if (y0 >= 0) { - if (x0 >= 0) { - a_result = src_rows[offset]; - } - if (x0 + 1 < src_width) { - b_result = src_rows[offset + 1]; - } - } - if (y0 + 1 < src_height) { - offset += static_cast(src_rows.stride()); - if (x0 >= 0) { - c_result = src_rows[offset]; - } - if (x0 + 1 < src_width) { - d_result = src_rows[offset + 1]; - } - } - }; - - auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { - MapVectorType xf = vld1q_f32(ptr_mapx); - MapVectorType yf = vld1q_f32(ptr_mapy); - // Convert obviously out-of-range coordinates to values that are just - // beyond the largest permitted image width & height. This avoids the need - // for special case handling elsewhere. 
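// A note on the `1 << 24` sentinel used on the next line: 2^24 is the
// largest range in which float32 represents every integer exactly, and
// remap_f32 rejects source images with width or height >= 2^24, so a
// coordinate clamped to this sentinel is always classified as out of
// range. Because any comparison against NaN is false, the vcleq/vbslq
// pair below also maps NaN coordinates to the sentinel, i.e. to border.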
- float32x4_t big = vdupq_n_f32(1 << 24); - xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big); - yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big); - - int32x4_t x0 = vcvtmq_s32_f32(xf); - int32x4_t y0 = vcvtmq_s32_f32(yf); - int x0_array[4], y0_array[4]; - unsigned a_array[4], b_array[4], c_array[4], d_array[4]; - vst1q_s32(x0_array, x0); - vst1q_s32(y0_array, y0); - for (int i = 0; i < 4; ++i) { - int x0i = x0_array[i]; - int y0i = y0_array[i]; - ptrdiff_t offset = x0i + y0i * src_rows_.stride(); - - // src_width < (1ULL << 24) && src_height_ < (1ULL << 24) is guaranteed - if (x0i < 0 || x0i + 1 >= static_cast(src_width_) || y0i < 0 || - y0i + 1 >= static_cast(src_height_)) { - // Not entirely within the source image - - a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value_[0]; - - if (x0i < -1 || x0i >= static_cast(src_width_) || y0i < -1 || - y0i >= static_cast(src_height_)) { - // Completely outside the source image - continue; - } - - get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, - y0i, offset, src_rows_, src_width_, src_height_); - continue; - } - - // Completely inside the source image - a_array[i] = src_rows_[offset]; - b_array[i] = src_rows_[offset + 1]; - offset += src_rows_.stride(); - c_array[i] = src_rows_[offset]; - d_array[i] = src_rows_[offset + 1]; - } +void remap_f32_nearest_constant(uint32x4_t v_xmax, uint32x4_t v_ymax, + uint32x4_t v_src_stride, + Rows src_rows, + Columns dst, size_t dst_width, + Columns mapx, + Columns mapy, const size_t kStep, + ScalarType border_value) { + LoopUnroll2 loop{dst_width, kStep}; + loop.unroll_once([&](size_t x) { + transform_pixels_constant( + vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x]), v_xmax, v_ymax, v_src_stride, + src_rows, dst.at(x), border_value); + }); +} - float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf)); - float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf)); - float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array)); - float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array)); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array)); - float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array)); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vcvtaq_u32_f32(result); - }; +template +void remap_f32_linear(uint32x4_t v_xmax, uint32x4_t v_ymax, + uint32x4_t v_src_stride, Rows src_rows, + Columns dst, size_t dst_width, + Columns mapx, Columns mapy, + const size_t kStep, ScalarType border_value) { + auto load_xy = [&](size_t x) { + return FloatVectorPair{vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x])}; + }; + + auto vector_path = [&](size_t x) { + float32x4_t a, b, c, d, xfrac, yfrac; + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + load_quad_pixels_replicate( + load_xy(x), v_xmax, v_ymax, v_src_stride, src_rows, xfrac, yfrac, a, + b, c, d); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + load_quad_pixels_constant( + load_xy(x), v_xmax, v_ymax, v_src_stride, border_value, src_rows, + xfrac, yfrac, a, b, c, d); + } + return lerp_2d(xfrac, yfrac, a, b, c, d); + }; - auto vector_path_4 = [&](size_t step) { // step = 4*4 = 16 - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); + LoopUnroll2 loop{dst_width, kStep}; - ptr_mapx += kStep; - ptr_mapy += kStep; - uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); + if constexpr (std::is_same::value) { + 
loop.unroll_four_times([&](size_t x) { + ScalarType *p_dst = &dst[x]; + uint32x4_t res0 = vector_path(x); + x += kStep; + uint32x4_t res1 = vector_path(x); uint16x8_t result16_0 = vuzp1q_u16(res0, res1); - ptr_mapx += kStep; - ptr_mapy += kStep; - res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - res1 = vector_path_1(ptr_mapx, ptr_mapy); + x += kStep; + res0 = vector_path(x); + x += kStep; + res1 = vector_path(x); uint16x8_t result16_1 = vuzp1q_u16(res0, res1); - vst1q_u8(&dst[0], vuzp1q_u8(result16_0, result16_1)); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - LoopUnroll loop{width, kStep}; - loop.unroll_four_times(vector_path_4); - loop.unroll_once([&](size_t step) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - dst[0] = vgetq_lane_u32(result, 0); - dst[1] = vgetq_lane_u32(result, 1); - dst[2] = vgetq_lane_u32(result, 2); - dst[3] = vgetq_lane_u32(result, 3); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); + vst1q_u8(p_dst, vuzp1q_u8(result16_0, result16_1)); }); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - dst[0] = vgetq_lane_u32(result, 0); - dst[1] = vgetq_lane_u32(result, 1); - dst[2] = vgetq_lane_u32(result, 2); - dst[3] = vgetq_lane_u32(result, 3); + loop.unroll_once([&](size_t x) { + uint32x4_t result = vector_path(x); + dst[x] = vgetq_lane_u32(result, 0); + dst[x + 1] = vgetq_lane_u32(result, 1); + dst[x + 2] = vgetq_lane_u32(result, 2); + dst[x + 3] = vgetq_lane_u32(result, 3); }); - } - - private: - Rows src_rows_; - size_t src_width_; - size_t src_height_; - const ScalarType *border_value_; -}; // end of class RemapF32ConstantBorder -// NOLINTEND(readability-function-cognitive-complexity) - -// TODO: Need to refactor to reduce the complexity -// NOLINTBEGIN(readability-function-cognitive-complexity) -template -class RemapF32ConstantBorder { - public: - using ScalarType = uint16_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32ConstantBorder(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value) - : src_rows_{src_rows}, - src_width_{src_width}, - src_height_{src_height}, - border_value_{border_value} {} - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto get_edge_pixels = [&](unsigned &a_result, unsigned &b_result, - unsigned &c_result, unsigned &d_result, int x0, - int y0, ptrdiff_t index, - Rows src_rows, int src_width, - int src_height) { - if (y0 >= 0) { - if (x0 >= 0) { - a_result = src_rows[index]; - } - if (x0 + 1 < src_width) { - b_result = src_rows[index + 1]; - } - } - if (y0 + 1 < src_height) { - index += static_cast(src_rows.stride() / sizeof(ScalarType)); - if (x0 >= 0) { - c_result = src_rows[index]; - } - if (x0 + 1 < src_width) { - d_result = src_rows[index + 1]; - } - } - }; - - auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { - MapVectorType xf = vld1q_f32(ptr_mapx); - MapVectorType yf = vld1q_f32(ptr_mapy); - // Convert obviously out-of-range coordinates to values that are just - 
// beyond the largest permitted image width & height. This avoids the need - // for special case handling elsewhere. - float32x4_t big = vdupq_n_f32(1 << 24); - xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big); - yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big); - - int32x4_t x0 = vcvtmq_s32_f32(xf); - int32x4_t y0 = vcvtmq_s32_f32(yf); - int x0_array[4], y0_array[4]; - unsigned a_array[4], b_array[4], c_array[4], d_array[4]; - vst1q_s32(x0_array, x0); - vst1q_s32(y0_array, y0); - for (int i = 0; i < 4; ++i) { - int x0i = x0_array[i]; - int y0i = y0_array[i]; - ptrdiff_t index = - x0i + y0i * static_cast(src_rows_.stride() / - sizeof(ScalarType)); - // std::cout << "x0i " << x0i << " y0i " << y0i << " index: " - // << index - // xw << "\n"; - // src_width < (1ULL << 24) && src_height_ < (1ULL << 24) is guaranteed - if (x0i < 0 || x0i + 1 >= static_cast(src_width_) || y0i < 0 || - y0i + 1 >= static_cast(src_height_)) { - // Not entirely within the source image - - a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value_[0]; - - if (x0i < -1 || x0i >= static_cast(src_width_) || y0i < -1 || - y0i >= static_cast(src_height_)) { - // Completely outside the source image - continue; - } - - get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, - y0i, index, src_rows_, src_width_, src_height_); - continue; - } - - // Completely inside the source image - a_array[i] = src_rows_[index]; - b_array[i] = src_rows_[index + 1]; - index += src_rows_.stride() / sizeof(ScalarType); - c_array[i] = src_rows_[index]; - d_array[i] = src_rows_[index + 1]; - } - - float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf)); - float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf)); - float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array)); - float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array)); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array)); - float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array)); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vcvtaq_u32_f32(result); - }; - - auto vector_path_2 = [&](size_t step) { // step = 2*4 = 8 - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); - uint16x8_t result16 = vuzp1q_u16(res0, res1); - - vst1q_u16(&dst[0], result16); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_twice(vector_path_2); - loop.unroll_once([&](size_t step) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[0], result16); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); + } else if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType *p_dst = &dst[x]; + uint32x4_t res0 = vector_path(x); + x += kStep; + uint32x4_t res1 = vector_path(x); + vst1q_u16(p_dst, vuzp1q_u16(res0, res1)); }); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = 
vector_path_1(ptr_mapx, ptr_mapy); + loop.unroll_once([&](size_t x) { + uint32x4_t result = vector_path(x); uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[0], result16); + vst1_u16(&dst[x], result16); }); } +} - private: - Rows src_rows_; - size_t src_width_; - size_t src_height_; - const ScalarType *border_value_; -}; // end of class RemapF32ConstantBorder -// NOLINTEND(readability-function-cognitive-complexity) - -template -class RemapF32NearestReplicate; - -template -class RemapF32NearestReplicate { - public: - using ScalarType = uint8_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32NearestReplicate(Rows src_rows, size_t src_width, - size_t src_height) - : src_rows_{src_rows}, - v_src_stride_{vdupq_n_u32(static_cast(src_rows_.stride()))}, - v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} - - void get_map_coordinates(Columns mapx, Columns mapy, - uint32x4_t &x, uint32x4_t &y) { - MapVectorType x_raw = vld1q_f32(&mapx[0]); - MapVectorType y_raw = vld1q_f32(&mapy[0]); - - MapVectorType bias = vdupq_n_f32(0.5F); - // Round to nearest positive value - uint32x4_t x_nearest = vcvtmq_u32_f32(vaddq_f32(x_raw, bias)); - uint32x4_t y_nearest = vcvtmq_u32_f32(vaddq_f32(y_raw, bias)); - - // Clamp coordinates to within the dimensions of the source image - x = vmaxq_u32(vdupq_n_u32(0), vminq_u32(x_nearest, v_xmax_)); - y = vmaxq_u32(vdupq_n_u32(0), vminq_u32(y_nearest, v_ymax_)); - } - - uint8x8_t load_pixels_large(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_stride_)); - uint64x2_t indices_high = - vmlal_high_u32(vmovl_high_u32(x), y, v_src_stride_); - - uint8x8_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)], - src_rows_[vgetq_lane_u64(indices_low, 1)], - src_rows_[vgetq_lane_u64(indices_high, 0)], - src_rows_[vgetq_lane_u64(indices_high, 1)], - 0, - 0, - 0, - 0}; - return pixels; - } - - uint8x8_t load_pixels_small(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint32x4_t indices = vmlaq_u32(x, y, v_src_stride_); - - uint8x8_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)], - src_rows_[vgetq_lane_u32(indices, 1)], - src_rows_[vgetq_lane_u32(indices, 2)], - src_rows_[vgetq_lane_u32(indices, 3)], - 0, - 0, - 0, - 0}; - return pixels; - } - - void store_pixels(uint8x8_t pixels, Columns dst) { - dst[0] = vget_lane_u8(pixels, 0); - dst[1] = vget_lane_u8(pixels, 1); - dst[2] = vget_lane_u8(pixels, 2); - dst[3] = vget_lane_u8(pixels, 3); - } - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto vector_path = [&](size_t step) { - uint32x4_t x, y; - get_map_coordinates(mapx, mapy, x, y); - - uint8x8_t pixels; - if constexpr (IsLarge) { - pixels = load_pixels_large(x, y); - } else { - pixels = load_pixels_small(x, y); - } - - store_pixels(pixels, dst); - - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_once(vector_path); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t step) { vector_path(step); }); - } - - private: - Rows src_rows_; - uint32x4_t 
v_src_stride_; - uint32x4_t v_xmax_; - uint32x4_t v_ymax_; -}; // end of class RemapF32NearestReplicate - -template -class RemapF32NearestReplicate { - public: - using ScalarType = uint16_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32NearestReplicate(Rows src_rows, size_t src_width, - size_t src_height) - : src_rows_{src_rows}, - v_src_element_stride_{vdupq_n_u32( - static_cast(src_rows_.stride() / sizeof(ScalarType)))}, - v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} - - void get_map_coordinates(Columns mapx, Columns mapy, - uint32x4_t &x, uint32x4_t &y) { - MapVectorType x_raw = vld1q_f32(&mapx[0]); - MapVectorType y_raw = vld1q_f32(&mapy[0]); - - MapVectorType bias = vdupq_n_f32(0.5F); - // Round to nearest positive value - uint32x4_t x_nearest = vcvtmq_u32_f32(vaddq_f32(x_raw, bias)); - uint32x4_t y_nearest = vcvtmq_u32_f32(vaddq_f32(y_raw, bias)); - - // Clamp coordinates to within the dimensions of the source image - x = vmaxq_u32(vdupq_n_u32(0), vminq_u32(x_nearest, v_xmax_)); - y = vmaxq_u32(vdupq_n_u32(0), vminq_u32(y_nearest, v_ymax_)); - } - - uint16x4_t load_pixels_large(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * element_stride + x) - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_element_stride_)); - uint64x2_t indices_high = - vmlal_high_u32(vmovl_high_u32(x), y, v_src_element_stride_); - - uint16x4_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)], - src_rows_[vgetq_lane_u64(indices_low, 1)], - src_rows_[vgetq_lane_u64(indices_high, 0)], - src_rows_[vgetq_lane_u64(indices_high, 1)]}; - return pixels; - } - - uint16x4_t load_pixels_small(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * element_stride + x) - uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride_); - - uint16x4_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)], - src_rows_[vgetq_lane_u32(indices, 1)], - src_rows_[vgetq_lane_u32(indices, 2)], - src_rows_[vgetq_lane_u32(indices, 3)]}; - return pixels; - } - - void store_pixels(uint16x4_t pixels, Columns dst) { - vst1_u16(&dst[0], pixels); - } - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto vector_path = [&](size_t step) { - uint32x4_t x, y; - get_map_coordinates(mapx, mapy, x, y); - - uint16x4_t pixels; - if constexpr (IsLarge) { - pixels = load_pixels_large(x, y); - } else { - pixels = load_pixels_small(x, y); - } - - store_pixels(pixels, dst); - - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_once(vector_path); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t step) { vector_path(step); }); - } - - private: - Rows src_rows_; - uint32x4_t v_src_element_stride_; - uint32x4_t v_xmax_; - uint32x4_t v_ymax_; -}; // end of class RemapF32NearestReplicate - -template -class RemapF32NearestConstant; - -template -class RemapF32NearestConstant { - public: - using ScalarType = uint8_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32NearestConstant(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType 
*border_value) - : src_rows_{src_rows}, - v_src_stride_{vdupq_n_u32(static_cast(src_rows_.stride()))}, - v_width_{vdupq_n_u32(static_cast(src_width))}, - v_height_{vdupq_n_u32(static_cast(src_height))}, - v_border_{vdup_n_u8(*border_value)} {} - - void get_map_coordinates(Columns mapx, Columns mapy, - uint32x4_t &x, uint32x4_t &y, uint32x4_t &in_range) { - MapVectorType x_raw = vld1q_f32(&mapx[0]); - MapVectorType y_raw = vld1q_f32(&mapy[0]); - - MapVectorType bias = vdupq_n_f32(0.5F); - float32x4_t x_biased = vaddq_f32(x_raw, bias); - float32x4_t y_biased = vaddq_f32(y_raw, bias); - - // Round to nearest positive value - uint32x4_t x_nearest = vcvtmq_u32_f32(x_biased); - uint32x4_t y_nearest = vcvtmq_u32_f32(y_biased); - - // Find whether coordinates are within the image dimensions. - uint32x4_t above_zero = - vandq_u32(vcgezq_f32(x_biased), vcgezq_f32(y_biased)); - uint32x4_t below_limits = vandq_u32(vcltq_u32(x_nearest, v_width_), - vcltq_u32(y_nearest, v_height_)); - in_range = vandq_u32(above_zero, below_limits); - - // Zero out-of-range coordinates. - x = vandq_u32(in_range, x_nearest); - y = vandq_u32(in_range, y_nearest); - } - - uint8x8_t load_pixels_large(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_stride_)); - uint64x2_t indices_high = - vmlal_high_u32(vmovl_high_u32(x), y, v_src_stride_); - - uint8x8_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)], - src_rows_[vgetq_lane_u64(indices_low, 1)], - src_rows_[vgetq_lane_u64(indices_high, 0)], - src_rows_[vgetq_lane_u64(indices_high, 1)], - 0, - 0, - 0, - 0}; - return pixels; - } - - uint8x8_t load_pixels_small(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint32x4_t indices = vmlaq_u32(x, y, v_src_stride_); - - uint8x8_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)], - src_rows_[vgetq_lane_u32(indices, 1)], - src_rows_[vgetq_lane_u32(indices, 2)], - src_rows_[vgetq_lane_u32(indices, 3)], - 0, - 0, - 0, - 0}; - return pixels; - } - - void store_pixels(uint8x8_t pixels, Columns dst) { - dst[0] = vget_lane_u8(pixels, 0); - dst[1] = vget_lane_u8(pixels, 1); - dst[2] = vget_lane_u8(pixels, 2); - dst[3] = vget_lane_u8(pixels, 3); - } - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto vector_path = [&](size_t step) { - uint32x4_t x, y; - uint32x4_t in_range; - get_map_coordinates(mapx, mapy, x, y, in_range); - - uint8x8_t pixels; - if constexpr (IsLarge) { - pixels = load_pixels_large(x, y); - } else { - pixels = load_pixels_small(x, y); - } - - // Select between source pixels and border colour - uint8x8_t in_range_narrowed = - vmovn_u16(vcombine_u16(vmovn_u32(in_range), vdup_n_u16(0))); - uint8x8_t pixels_or_border = - vbsl_u8(in_range_narrowed, pixels, v_border_); - - store_pixels(pixels_or_border, dst); - - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_once(vector_path); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t step) { vector_path(step); }); - } - - private: - Rows src_rows_; - uint32x4_t v_src_stride_; - uint32x4_t v_width_; - uint32x4_t v_height_; - uint8x8_t v_border_; -}; // end of class RemapF32NearestConstant - -template 
-class RemapF32NearestConstant { - public: - using ScalarType = uint16_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32NearestConstant(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value) - : src_rows_{src_rows}, - v_src_element_stride_{vdupq_n_u32( - static_cast(src_rows_.stride() / sizeof(ScalarType)))}, - v_width_{vdupq_n_u32(static_cast(src_width))}, - v_height_{vdupq_n_u32(static_cast(src_height))}, - v_border_{vdup_n_u16(*border_value)} {} - - void get_map_coordinates(Columns mapx, Columns mapy, - uint32x4_t &x, uint32x4_t &y, uint32x4_t &in_range) { - MapVectorType x_raw = vld1q_f32(&mapx[0]); - MapVectorType y_raw = vld1q_f32(&mapy[0]); - - MapVectorType bias = vdupq_n_f32(0.5F); - float32x4_t x_biased = vaddq_f32(x_raw, bias); - float32x4_t y_biased = vaddq_f32(y_raw, bias); - - // Round to nearest positive value - uint32x4_t x_nearest = vcvtmq_u32_f32(x_biased); - uint32x4_t y_nearest = vcvtmq_u32_f32(y_biased); - - // Find whether coordinates are within the image dimensions. - uint32x4_t above_zero = - vandq_u32(vcgezq_f32(x_biased), vcgezq_f32(y_biased)); - uint32x4_t below_limits = vandq_u32(vcltq_u32(x_nearest, v_width_), - vcltq_u32(y_nearest, v_height_)); - in_range = vandq_u32(above_zero, below_limits); - - // Zero out-of-range coordinates. - x = vandq_u32(in_range, x_nearest); - y = vandq_u32(in_range, y_nearest); - } - - uint16x4_t load_pixels_large(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_element_stride_)); - uint64x2_t indices_high = - vmlal_high_u32(vmovl_high_u32(x), y, v_src_element_stride_); - - uint16x4_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)], - src_rows_[vgetq_lane_u64(indices_low, 1)], - src_rows_[vgetq_lane_u64(indices_high, 0)], - src_rows_[vgetq_lane_u64(indices_high, 1)]}; - return pixels; - } - - uint16x4_t load_pixels_small(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride_); - - uint16x4_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)], - src_rows_[vgetq_lane_u32(indices, 1)], - src_rows_[vgetq_lane_u32(indices, 2)], - src_rows_[vgetq_lane_u32(indices, 3)]}; - return pixels; - } - - void store_pixels(uint16x4_t pixels, Columns dst) { - vst1_u16(&dst[0], pixels); - } - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto vector_path = [&](size_t step) { - uint32x4_t x, y; - uint32x4_t in_range; - get_map_coordinates(mapx, mapy, x, y, in_range); - - uint16x4_t pixels; - if constexpr (IsLarge) { - pixels = load_pixels_large(x, y); +template +void transform_operation(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value, + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end, + Rows mapx_rows, + Rows mapy_rows) { + uint32x4_t v_src_stride = vdupq_n_u32( + static_cast(src_rows.stride() / sizeof(ScalarType))); + uint32x4_t v_xmax = vdupq_n_u32(static_cast(src_width - 1)); + uint32x4_t v_ymax = vdupq_n_u32(static_cast(src_height - 1)); + const size_t kStep = VecTraits::num_lanes(); + + for (size_t y = y_begin; y < y_end; ++y) { + if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + 
remap_f32_nearest_replicate( + v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), + dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep); } else { - pixels = load_pixels_small(x, y); + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + remap_f32_nearest_constant( + v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), + dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep, + border_value[0]); } - - // Select between source pixels and border colour - uint16x4_t in_range_narrowed = vmovn_u32(in_range); - uint16x4_t pixels_or_border = - vbsl_u16(in_range_narrowed, pixels, v_border_); - - store_pixels(pixels_or_border, dst); - - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_once(vector_path); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } else { + static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); + remap_f32_linear( + v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), + dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep, + Border == KLEIDICV_BORDER_TYPE_CONSTANT ? border_value[0] : 0); + } + ++mapx_rows; + ++mapy_rows; + ++dst_rows; } - - private: - Rows src_rows_; - uint32x4_t v_src_element_stride_; - uint32x4_t v_width_; - uint32x4_t v_height_; - uint16x4_t v_border_; -}; // end of class RemapF32NearestConstant +} // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) @@ -1101,8 +158,7 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, - [[maybe_unused]] const T *border_value) { - // may need to remove the maybe_unused + const T *border_value) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTER_AND_STRIDE(mapx, mapx_stride, dst_height); @@ -1120,7 +176,9 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, // Calculating in float32_t will only be precise until 24 bits if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24)) { + dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || + // Empty source image is not supported + src_width == 0 || src_height == 0) { return KLEIDICV_ERROR_RANGE; } @@ -1130,52 +188,10 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, Rows dst_rows{dst, dst_stride, channels}; Rectangle rect{dst_width, dst_height}; - if (interpolation == KLEIDICV_INTERPOLATION_LINEAR) { - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - RemapF32ConstantBorder operation{src_rows, src_width, - src_height, border_value}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } else { - RemapF32ConstantBorder operation{src_rows, src_width, - src_height, border_value}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } - } else { - assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - RemapF32Replicate operation{src_rows, src_width, src_height}; - zip_rows(operation, 
rect, mapx_rows, mapy_rows, dst_rows); - } else { - RemapF32Replicate operation{src_rows, src_width, src_height}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } - } - } else { - assert(interpolation == KLEIDICV_INTERPOLATION_NEAREST); - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - RemapF32NearestConstant operation{src_rows, src_width, - src_height, border_value}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } else { - RemapF32NearestConstant operation{src_rows, src_width, - src_height, border_value}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } - } else { - assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - RemapF32NearestReplicate operation{src_rows, src_width, - src_height}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } else { - RemapF32NearestReplicate operation{src_rows, src_width, - src_height}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } - } - } + transform_operation(is_image_large(src_rows, src_height), interpolation, + border_type, src_rows, src_width, src_height, + border_value, dst_rows, dst_width, 0, dst_height, + mapx_rows, mapy_rows); return KLEIDICV_OK; } diff --git a/kleidicv/src/transform/remap_f32_sve2.cpp b/kleidicv/src/transform/remap_f32_sve2.cpp new file mode 100644 index 000000000..e4ad3e724 --- /dev/null +++ b/kleidicv/src/transform/remap_f32_sve2.cpp @@ -0,0 +1,321 @@ +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include +#include + +#include "kleidicv/sve2.h" +#include "kleidicv/transform/remap.h" +#include "transform_sve2.h" + +namespace kleidicv::sve2 { + +template +void remap_f32_nearest(svuint32_t sv_xmax, svuint32_t sv_ymax, + svuint32_t sv_src_stride, + Rows src_rows, svuint32_t sv_border, + Columns dst, size_t kStep, size_t dst_width, + Rows mapx_rows, + Rows mapy_rows) { + svbool_t pg_all32 = svptrue_b32(); + auto load_coords = [&](svbool_t pg, size_t xs) { + auto x = static_cast(xs); + return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), + svld1_f32(pg, &mapy_rows.as_columns()[x])); + }; + + auto get_pixels = [&](svbool_t pg, svuint32x2_t coords) { + svuint32_t x = svget2(coords, 0); + svuint32_t y = svget2(coords, 1); + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + svbool_t in_range = svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), + svcmple_u32(pg, y, sv_ymax)); + svuint32_t result = + load_xy(in_range, x, y, sv_src_stride, src_rows); + // Select between source pixels and border colour + return svsel_u32(in_range, result, sv_border); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_REPLICATE); + return load_xy(pg, x, y, sv_src_stride, src_rows); + } + }; + + auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) { + svfloat32x2_t coords = load_coords(pg32, x); + svfloat32_t xf = svget2(coords, 0); + svfloat32_t yf = svget2(coords, 1); + + svuint32_t xi, yi; + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + // Convert coordinates to integers. + // Negative numbers will become large positive numbers. + // Since the source width and height is known to be <=2^24 these large + // positive numbers will always be treated as outside the source image + // bounds. 
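+      // (Worked example, for illustration: xf = -1.0f rounds to s32 -1,
+      // which reinterprets to u32 0xFFFFFFFF; that always exceeds sv_xmax,
+      // which is at most 2^24 - 1, so the lane is treated as border.)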
+ xi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); + yi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); + } else { + // Round to the nearest integer, clamp it to within the dimensions of + // the source image (negative values are already saturated to 0) + xi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), + sv_xmax); + yi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), + sv_ymax); + } + return svcreate2(xi, yi); + }; + + LoopUnroll2 loop{dst_width, kStep}; + + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svst1b_u32(pg32, &dst[static_cast(x)], result); + }; + + loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + x += kStep; + res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0), + svreinterpret_u8_u16(result1)); + svst1(svptrue_b8(), p_dst, result); + }); + loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining( + [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); + } + + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svst1h_u32(pg32, &dst[static_cast(x)], result); + }; + + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + svst1(svptrue_b16(), p_dst, result); + }); + loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining( + [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); + } +} + +template +void remap_f32_linear(svuint32_t sv_xmax, svuint32_t sv_ymax, + svfloat32_t sv_xmaxf, svfloat32_t sv_ymaxf, + svuint32_t sv_src_stride, Rows src_rows, + svuint32_t sv_border, Columns dst, + size_t kStep, size_t dst_width, + Rows mapx_rows, + Rows mapy_rows) { + auto load_coords = [&](svbool_t pg, size_t xs) { + auto x = static_cast(xs); + return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), + svld1_f32(pg, &mapy_rows.as_columns()[x])); + }; + + auto calculate_linear = [&](svbool_t pg, uint32_t x) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + svfloat32x2_t coords = load_coords(pg, x); + return calculate_linear_replicated_border( + pg, coords, sv_xmaxf, 
sv_ymaxf, sv_src_stride, src_rows); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + svfloat32x2_t coords = load_coords(pg, x); + return calculate_linear_constant_border( + pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); + } + }; + + auto store_vector = [](svbool_t pg32, ScalarType* p_dst, svuint32_t result) { + if constexpr (std::is_same::value) { + svst1b_u32(pg32, p_dst, result); + } + if constexpr (std::is_same::value) { + svst1h_u32(pg32, p_dst, result); + } + }; + + svbool_t pg_all32 = svptrue_b32(); + LoopUnroll2 loop{dst_width, kStep}; + if constexpr (std::is_same::value) { + loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16_0 = + svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); + x += kStep; + res0 = calculate_linear(pg_all32, x); + x += kStep; + res1 = calculate_linear(pg_all32, x); + svuint16_t result16_1 = + svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); + svst1_u8(svptrue_b8(), p_dst, + svuzp1_u8(svreinterpret_u8_u16(result16_0), + svreinterpret_u8_u16(result16_1))); + }); + } else if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16 = + svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); + svst1_u16(svptrue_b16(), p_dst, result16); + }); + } + loop.unroll_once([&](size_t x) { + svuint32_t result = calculate_linear(pg_all32, x); + store_vector(pg_all32, &dst[static_cast(x)], result); + }); + loop.remaining([&](size_t x, size_t x_max) { + svbool_t pg32 = svwhilelt_b32(x, x_max); + svuint32_t result = calculate_linear(pg32, x); + store_vector(pg32, &dst[static_cast(x)], result); + }); +} + +template +void transform_operation(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType* border_value, + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end, + Rows mapx_rows, + Rows mapy_rows) { + svuint32_t sv_xmax = svdup_n_u32(src_width - 1); + svuint32_t sv_ymax = svdup_n_u32(src_height - 1); + svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); + svuint32_t sv_border = svdup_n_u32(0); + + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + sv_border = svdup_n_u32(border_value[0]); + } + + svfloat32_t sv_xmaxf = svdup_n_f32(static_cast(src_width - 1)); + svfloat32_t sv_ymaxf = svdup_n_f32(static_cast(src_height - 1)); + + const size_t kStep = VecTraits::num_lanes(); + + for (size_t y = y_begin; y < y_end; ++y) { + Columns dst = dst_rows.as_columns(); + if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { + remap_f32_nearest( + sv_xmax, sv_ymax, sv_src_stride, src_rows, sv_border, dst, kStep, + dst_width, mapx_rows, mapy_rows); + } else { + static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); + remap_f32_linear( + sv_xmax, sv_ymax, sv_xmaxf, sv_ymaxf, sv_src_stride, src_rows, + sv_border, dst, kStep, dst_width, mapx_rows, mapy_rows); + } + ++mapx_rows; + ++mapy_rows; + ++dst_rows; + } +} + +// Most of the complexity comes from parameter checking. 
+// NOLINTBEGIN(readability-function-cognitive-complexity) +template +kleidicv_error_t remap_f32(const T* src, size_t src_stride, size_t src_width, + size_t src_height, T* dst, size_t dst_stride, + size_t dst_width, size_t dst_height, size_t channels, + const float* mapx, size_t mapx_stride, + const float* mapy, size_t mapy_stride, + kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type, + const T* border_value) { + CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); + CHECK_POINTER_AND_STRIDE(mapx, mapx_stride, dst_height); + CHECK_POINTER_AND_STRIDE(mapy, mapy_stride, dst_height); + CHECK_IMAGE_SIZE(src_width, src_height); + CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } + + if (!remap_f32_is_implemented(src_stride, src_width, src_height, dst_width, + border_type, channels, interpolation)) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + // Calculating in float32_t will only be precise until 24 bits + if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || + dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || + // Empty source image is not supported + src_width == 0 || src_height == 0) { + return KLEIDICV_ERROR_RANGE; + } + + Rows src_rows{src, src_stride, channels}; + Rows mapx_rows{mapx, mapx_stride, 1}; + Rows mapy_rows{mapy, mapy_stride, 1}; + Rows dst_rows{dst, dst_stride, channels}; + Rectangle rect{dst_width, dst_height}; + + transform_operation(is_image_large(src_rows, src_height), interpolation, + border_type, src_rows, src_width, src_height, + border_value, dst_rows, dst_width, 0, dst_height, + mapx_rows, mapy_rows); + + return KLEIDICV_OK; +} +// NOLINTEND(readability-function-cognitive-complexity) + +#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32( \ + const type* src, size_t src_stride, size_t src_width, size_t src_height, \ + type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ + size_t channels, const float* mapx, size_t mapx_stride, \ + const float* mapy, size_t mapy_stride, \ + kleidicv_interpolation_type_t interpolation, \ + kleidicv_border_type_t border_type, const type* border_value) + +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint16_t); + +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/remap_s16_sve2.cpp b/kleidicv/src/transform/remap_s16_sve2.cpp new file mode 100644 index 000000000..4d5442c91 --- /dev/null +++ b/kleidicv/src/transform/remap_s16_sve2.cpp @@ -0,0 +1,277 @@ +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include +#include + +#include "kleidicv/sve2.h" +#include "kleidicv/transform/remap.h" +#include "transform_sve2.h" + +namespace kleidicv::sve2 { + +template +class RemapS16Replicate { + public: + using MapVecTraits = VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + + RemapS16Replicate(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_element_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) + : src_rows_{src_rows}, + v_src_element_stride_{v_src_element_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max} { + v_src_element_stride_ = svdup_u16(src_rows.stride() / 
sizeof(ScalarType)); + v_xmax_ = svdup_s16(static_cast(src_width - 1)); + v_ymax_ = svdup_s16(static_cast(src_height - 1)); + } + + void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, + svuint32_t offsets_t, svbool_t pg_t, + Columns dst); + + void process_row(size_t width, Columns mapxy, + Columns dst) { + svuint32_t offsets_b, offsets_t; + svint16_t svzero = svdup_n_s16(0); + auto load_offsets = [&](svbool_t pg) { + MapVector2Type xy = svld2_s16(pg, &mapxy[0]); + // Clamp coordinates to within the dimensions of the source image + svuint16_t x = svreinterpret_u16_s16( + svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 0), v_xmax_))); + svuint16_t y = svreinterpret_u16_s16( + svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 1), v_ymax_))); + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); + offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); + }; + + svbool_t pg_all16 = MapVecTraits::svptrue(); + svbool_t pg_all32 = svptrue_b32(); + + auto gather_load_generic_vector_path = [&](svbool_t pg, ptrdiff_t step) { + load_offsets(pg); + svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); + svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); + transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, dst); + mapxy += step; + dst += step; + }; + + // NOTE: gather load is not available in streaming mode + auto gather_load_full_vector_path = [&](ptrdiff_t step) { + load_offsets(pg_all16); + transform_pixels(pg_all16, offsets_b, pg_all32, offsets_t, pg_all32, dst); + mapxy += step; + dst += step; + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once([&](size_t step) { + gather_load_full_vector_path(static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = MapVecTraits::svwhilelt(step - length, step); + gather_load_generic_vector_path(pg, static_cast(length)); + }); + } + + private: + Rows src_rows_; + svuint16_t& v_src_element_stride_; + MapVectorType& v_xmax_; + MapVectorType& v_ymax_; +}; // end of class RemapS16Replicate + +template <> +void RemapS16Replicate::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, Columns dst) { + // Copy pixels from source + svuint32_t result_b = + svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svst1b_u16(pg, &dst[0], result); +} + +template <> +void RemapS16Replicate::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, Columns dst) { + // Account for the size of the source type when calculating offset + offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); + offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); + + // Copy pixels from source + svuint32_t result_b = + svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svst1_u16(pg, &dst[0], result); +} + +template +class RemapS16ConstantBorder { + public: + RemapS16ConstantBorder(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType* border_value, + svuint16_t& v_src_element_stride, svuint16_t& v_width, + svuint16_t& 
v_height, svuint16_t& v_border) + : src_rows_{src_rows}, + v_src_element_stride_{v_src_element_stride}, + v_width_{v_width}, + v_height_{v_height}, + v_border_{v_border} { + v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); + v_width_ = svdup_u16(static_cast(src_width)); + v_height_ = svdup_u16(static_cast(src_height)); + v_border_ = svdup_u16(*border_value); + } + + void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, + svuint32_t offsets_t, svbool_t pg_t, ScalarType* dst); + + void process_row(size_t width, Columns mapxy, + Columns dst) { + for (size_t i = 0; i < width; i += svcnth()) { + svbool_t pg = svwhilelt_b16(i, width); + + svint16x2_t xy = svld2_s16(pg, &mapxy[static_cast(i * 2)]); + svuint16_t x = svreinterpret_u16_s16(svget2(xy, 0)); + svuint16_t y = svreinterpret_u16_s16(svget2(xy, 1)); + + // Find whether coordinates are within the image dimensions. + svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), + svcmplt_u16(pg, y, v_height_)); + svbool_t pg_b = in_range; + svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); + + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + svuint32_t offsets_b = + svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); + svuint32_t offsets_t = + svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); + + transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, + &dst[static_cast(i)]); + } + } + + private: + Rows src_rows_; + svuint16_t& v_src_element_stride_; + svuint16_t& v_width_; + svuint16_t& v_height_; + svuint16_t& v_border_; +}; // end of class RemapS16ConstantBorder + +template <> +void RemapS16ConstantBorder::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, uint8_t* dst) { + // Copy pixels from source + svuint32_t result_b = + svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svuint16_t result_selected = svsel(pg_b, result, v_border_); + svst1b_u16(pg, dst, result_selected); +} + +template <> +void RemapS16ConstantBorder::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, uint16_t* dst) { + // Account for the size of the source type when calculating offset + offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); + offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); + + // Copy pixels from source + svuint32_t result_b = + svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svuint16_t result_selected = svsel(pg_b, result, v_border_); + svst1_u16(pg, dst, result_selected); +} + +// Most of the complexity comes from parameter checking. 
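+// remap_s16 validates its arguments and then dispatches on border_type:
+// RemapS16ConstantBorder predicates the gather loads on an in-range mask and
+// selects the border colour for the masked-off lanes, while
+// RemapS16Replicate first clamps each coordinate into [0, max].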
+// NOLINTBEGIN(readability-function-cognitive-complexity) +template +kleidicv_error_t remap_s16(const T* src, size_t src_stride, size_t src_width, + size_t src_height, T* dst, size_t dst_stride, + size_t dst_width, size_t dst_height, size_t channels, + const int16_t* mapxy, size_t mapxy_stride, + kleidicv_border_type_t border_type, + const T* border_value) { + CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); + CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); + CHECK_IMAGE_SIZE(src_width, src_height); + CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } + + if (!remap_s16_is_implemented(src_stride, src_width, src_height, dst_width, + border_type, channels)) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + Rows src_rows{src, src_stride, channels}; + Rows mapxy_rows{mapxy, mapxy_stride, 2}; + Rows dst_rows{dst, dst_stride, channels}; + svuint16_t sv_src_element_stride; + Rectangle rect{dst_width, dst_height}; + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { + svuint16_t sv_width, sv_height, sv_border; + RemapS16ConstantBorder operation{ + src_rows, src_width, src_height, border_value, sv_src_element_stride, + sv_width, sv_height, sv_border}; + zip_rows(operation, rect, mapxy_rows, dst_rows); + } else { + assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); + svint16_t sv_xmax, sv_ymax; + RemapS16Replicate operation{src_rows, src_width, + src_height, sv_src_element_stride, + sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, dst_rows); + } + return KLEIDICV_OK; +} +// NOLINTEND(readability-function-cognitive-complexity) + +#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16( \ + const type* src, size_t src_stride, size_t src_width, size_t src_height, \ + type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ + size_t channels, const int16_t* mapxy, size_t mapxy_stride, \ + kleidicv_border_type_t border_type, const type* border_value) + +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t); + +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_s16point5_sve2.cpp similarity index 50% rename from kleidicv/src/transform/remap_sc.h rename to kleidicv/src/transform/remap_s16point5_sve2.cpp index a0e27f169..489d37386 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_s16point5_sve2.cpp @@ -2,274 +2,18 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_REMAP_SC_H -#define KLEIDICV_REMAP_SC_H - #include -#include #include #include #include #include -#include -#include -#include "common_sc.h" #include "kleidicv/sve2.h" #include "kleidicv/transform/remap.h" +#include "transform_sve2.h" -namespace KLEIDICV_TARGET_NAMESPACE { - -template -class RemapS16Replicate { - public: - using MapVecTraits = VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; - using MapVector2Type = typename MapVecTraits::Vector2Type; - - RemapS16Replicate(Rows src_rows, size_t src_width, - size_t src_height, svuint16_t& v_src_element_stride, - MapVectorType& v_x_max, MapVectorType& v_y_max) - : src_rows_{src_rows}, - v_src_element_stride_{v_src_element_stride}, - v_xmax_{v_x_max}, - v_ymax_{v_y_max} { - v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); - v_xmax_ = 
svdup_s16(static_cast(src_width - 1)); - v_ymax_ = svdup_s16(static_cast(src_height - 1)); - } - - void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, - svuint32_t offsets_t, svbool_t pg_t, - Columns dst); - - void process_row(size_t width, Columns mapxy, - Columns dst) { - svuint32_t offsets_b, offsets_t; - svint16_t svzero = svdup_n_s16(0); - auto load_offsets = [&](svbool_t pg) { - MapVector2Type xy = svld2_s16(pg, &mapxy[0]); - // Clamp coordinates to within the dimensions of the source image - svuint16_t x = svreinterpret_u16_s16( - svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 0), v_xmax_))); - svuint16_t y = svreinterpret_u16_s16( - svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 1), v_ymax_))); - // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) - offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); - offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); - }; - - svbool_t pg_all16 = MapVecTraits::svptrue(); - svbool_t pg_all32 = svptrue_b32(); - - auto gather_load_generic_vector_path = [&](svbool_t pg, ptrdiff_t step) { - load_offsets(pg); - svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); - svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); - transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, dst); - mapxy += step; - dst += step; - }; - - // NOTE: gather load is not available in streaming mode - auto gather_load_full_vector_path = [&](ptrdiff_t step) { - load_offsets(pg_all16); - transform_pixels(pg_all16, offsets_b, pg_all32, offsets_t, pg_all32, dst); - mapxy += step; - dst += step; - }; - - LoopUnroll loop{width, MapVecTraits::num_lanes()}; - loop.unroll_once([&](size_t step) { - gather_load_full_vector_path(static_cast(step)); - }); - loop.remaining([&](size_t length, size_t step) { - svbool_t pg = MapVecTraits::svwhilelt(step - length, step); - gather_load_generic_vector_path(pg, static_cast(length)); - }); - } - - private: - Rows src_rows_; - svuint16_t& v_src_element_stride_; - MapVectorType& v_xmax_; - MapVectorType& v_ymax_; -}; // end of class RemapS16Replicate - -template <> -void RemapS16Replicate::transform_pixels( - svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, - svbool_t pg_t, Columns dst) { - // Copy pixels from source - svuint32_t result_b = - svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); - svuint32_t result_t = - svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); - svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), - svreinterpret_u16_u32(result_t)); - - svst1b_u16(pg, &dst[0], result); -} - -template <> -void RemapS16Replicate::transform_pixels( - svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, - svbool_t pg_t, Columns dst) { - // Account for the size of the source type when calculating offset - offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); - offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); - - // Copy pixels from source - svuint32_t result_b = - svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); - svuint32_t result_t = - svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); - svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), - svreinterpret_u16_u32(result_t)); - - svst1_u16(pg, &dst[0], result); -} - -template -class RemapS16ConstantBorder { - public: - RemapS16ConstantBorder(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType* border_value, - svuint16_t& v_src_element_stride, svuint16_t& v_width, - svuint16_t& v_height, svuint16_t& v_border) - : 
src_rows_{src_rows}, - v_src_element_stride_{v_src_element_stride}, - v_width_{v_width}, - v_height_{v_height}, - v_border_{v_border} { - v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); - v_width_ = svdup_u16(static_cast(src_width)); - v_height_ = svdup_u16(static_cast(src_height)); - v_border_ = svdup_u16(*border_value); - } - - void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, - svuint32_t offsets_t, svbool_t pg_t, ScalarType* dst); - - void process_row(size_t width, Columns mapxy, - Columns dst) { - for (size_t i = 0; i < width; i += svcnth()) { - svbool_t pg = svwhilelt_b16(i, width); - - svint16x2_t xy = svld2_s16(pg, &mapxy[static_cast(i * 2)]); - svuint16_t x = svreinterpret_u16_s16(svget2(xy, 0)); - svuint16_t y = svreinterpret_u16_s16(svget2(xy, 1)); - - // Find whether coordinates are within the image dimensions. - svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), - svcmplt_u16(pg, y, v_height_)); - svbool_t pg_b = in_range; - svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); - - // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) - svuint32_t offsets_b = - svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); - svuint32_t offsets_t = - svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); - - transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, - &dst[static_cast(i)]); - } - } - - private: - Rows src_rows_; - svuint16_t& v_src_element_stride_; - svuint16_t& v_width_; - svuint16_t& v_height_; - svuint16_t& v_border_; -}; // end of class RemapS16ConstantBorder - -template <> -void RemapS16ConstantBorder::transform_pixels( - svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, - svbool_t pg_t, uint8_t* dst) { - // Copy pixels from source - svuint32_t result_b = - svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); - svuint32_t result_t = - svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); - - svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), - svreinterpret_u16_u32(result_t)); - - svuint16_t result_selected = svsel(pg_b, result, v_border_); - svst1b_u16(pg, dst, result_selected); -} - -template <> -void RemapS16ConstantBorder::transform_pixels( - svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, - svbool_t pg_t, uint16_t* dst) { - // Account for the size of the source type when calculating offset - offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); - offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); - - // Copy pixels from source - svuint32_t result_b = - svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); - svuint32_t result_t = - svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); - - svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), - svreinterpret_u16_u32(result_t)); - - svuint16_t result_selected = svsel(pg_b, result, v_border_); - svst1_u16(pg, dst, result_selected); -} - -// Most of the complexity comes from parameter checking. 
-// NOLINTBEGIN(readability-function-cognitive-complexity) -template -kleidicv_error_t remap_s16_sc(const T* src, size_t src_stride, size_t src_width, - size_t src_height, T* dst, size_t dst_stride, - size_t dst_width, size_t dst_height, - size_t channels, const int16_t* mapxy, - size_t mapxy_stride, - kleidicv_border_type_t border_type, - const T* border_value) { - CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); - CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); - CHECK_IMAGE_SIZE(src_width, src_height); - CHECK_IMAGE_SIZE(dst_width, dst_height); - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { - return KLEIDICV_ERROR_NULL_POINTER; - } - - if (!remap_s16_is_implemented(src_stride, src_width, src_height, dst_width, - border_type, channels)) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; - } - - Rows src_rows{src, src_stride, channels}; - Rows mapxy_rows{mapxy, mapxy_stride, 2}; - Rows dst_rows{dst, dst_stride, channels}; - svuint16_t sv_src_element_stride; - Rectangle rect{dst_width, dst_height}; - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - svuint16_t sv_width, sv_height, sv_border; - RemapS16ConstantBorder operation{ - src_rows, src_width, src_height, border_value, sv_src_element_stride, - sv_width, sv_height, sv_border}; - zip_rows(operation, rect, mapxy_rows, dst_rows); - } else { - assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); - svint16_t sv_xmax, sv_ymax; - RemapS16Replicate operation{src_rows, src_width, - src_height, sv_src_element_stride, - sv_xmax, sv_ymax}; - zip_rows(operation, rect, mapxy_rows, dst_rows); - } - return KLEIDICV_OK; -} -// NOLINTEND(readability-function-cognitive-complexity) +namespace kleidicv::sve2 { template inline svuint16_t interpolate_16point5(svbool_t pg, svuint16_t frac, @@ -778,12 +522,14 @@ class RemapS16Point5ConstantBorder { // Most of the complexity comes from parameter checking. 
// NOLINTBEGIN(readability-function-cognitive-complexity) template -kleidicv_error_t remap_s16point5_sc( - const T* src, size_t src_stride, size_t src_width, size_t src_height, - T* dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t channels, const int16_t* mapxy, size_t mapxy_stride, - const uint16_t* mapfrac, size_t mapfrac_stride, - kleidicv_border_type_t border_type, const T* border_value) { +kleidicv_error_t remap_s16point5(const T* src, size_t src_stride, + size_t src_width, size_t src_height, T* dst, + size_t dst_stride, size_t dst_width, + size_t dst_height, size_t channels, + const int16_t* mapxy, size_t mapxy_stride, + const uint16_t* mapfrac, size_t mapfrac_stride, + kleidicv_border_type_t border_type, + const T* border_value) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); @@ -821,291 +567,17 @@ kleidicv_error_t remap_s16point5_sc( } return KLEIDICV_OK; } - -template -void remap32f_nearest(svuint32_t sv_xmax, svuint32_t sv_ymax, - svuint32_t sv_src_stride, Rows src_rows, - svuint32_t sv_border, Columns dst, - size_t kStep, size_t dst_width, - Rows mapx_rows, - Rows mapy_rows) { - svbool_t pg_all32 = svptrue_b32(); - auto load_coords = [&](svbool_t pg, size_t xs) { - auto x = static_cast(xs); - return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), - svld1_f32(pg, &mapy_rows.as_columns()[x])); - }; - - auto get_pixels = [&](svbool_t pg, svuint32x2_t coords) { - svuint32_t x = svget2(coords, 0); - svuint32_t y = svget2(coords, 1); - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - svbool_t in_range = svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), - svcmple_u32(pg, y, sv_ymax)); - svuint32_t result = load_common( - in_range, x, y, sv_src_stride, src_rows); - // Select between source pixels and border colour - return svsel_u32(in_range, result, sv_border); - } else { - static_assert(Border == KLEIDICV_BORDER_TYPE_REPLICATE); - return load_common(pg, x, y, sv_src_stride, - src_rows); - } - }; - - auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) { - svfloat32x2_t coords = load_coords(pg32, x); - svfloat32_t xf = svget2(coords, 0); - svfloat32_t yf = svget2(coords, 1); - - svuint32_t xi, yi; - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - // Round to the nearest integer - xi = svreinterpret_u32_s32( - svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); - yi = svreinterpret_u32_s32( - svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); - } else { - // Round to the nearest integer, clamp it to within the dimensions of - // the source image (negative values are already saturated to 0) - xi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), - sv_xmax); - yi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), - sv_ymax); - } - return svcreate2(xi, yi); - }; - - LoopUnroll2 loop{dst_width, kStep}; - - if constexpr (std::is_same::value) { - auto vector_path_generic = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - svuint32_t result = - get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); - svst1b_u32(pg32, &dst[static_cast(x)], result); - }; - - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - 
get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - x += kStep; - res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0), - svreinterpret_u8_u16(result1)); - svst1(svptrue_b8(), p_dst, result); - }); - loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); - loop.remaining( - [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); - } - - if constexpr (std::is_same::value) { - auto vector_path_generic = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - svuint32_t result = - get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); - svst1h_u32(pg32, &dst[static_cast(x)], result); - }; - - loop.unroll_twice([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svst1(svptrue_b16(), p_dst, result); - }); - loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); - loop.remaining( - [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); - } -} - -// TODO reduce functional complexity -template -void remap32f_process_rows(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType* border_value, - Rows dst_rows, size_t dst_width, - size_t y_begin, size_t y_end, - Rows mapx_rows, - Rows mapy_rows) { - svbool_t pg_all32 = svptrue_b32(); - svuint32_t sv_xmax = svdup_n_u32(src_width - 1); - svuint32_t sv_ymax = svdup_n_u32(src_height - 1); - svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); - svuint32_t sv_border = svdup_n_u32(0); - - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - sv_border = svdup_n_u32(border_value[0]); - } - - svfloat32_t xmaxf = svdup_n_f32(static_cast(src_width - 1)); - svfloat32_t ymaxf = svdup_n_f32(static_cast(src_height - 1)); - - const size_t kStep = VecTraits::num_lanes(); - - // auto get_coordinates = [&](svbool_t pg, size_t xs) { - auto coordinate_getter = [&](svbool_t pg, size_t xs) { - auto x = static_cast(xs); - return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), - svld1_f32(pg, &mapy_rows.as_columns()[x])); - }; - - auto calculate_linear = [&](svbool_t pg, uint32_t x) { - if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_replicated_border( - pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); - } else { - static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_constant_border( - pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); - } - }; - - for (size_t y = y_begin; y < y_end; ++y) { - Columns dst = dst_rows.as_columns(); - LoopUnroll2 loop{dst_width, kStep}; - if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - remap32f_nearest( - sv_xmax, sv_ymax, sv_src_stride, src_rows, sv_border, dst, kStep, - dst_width, 
mapx_rows, mapy_rows); - } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { - if constexpr (std::is_same::value) { - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - x += kStep; - res0 = calculate_linear(pg_all32, x); - x += kStep; - res1 = calculate_linear(pg_all32, x); - svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - svst1_u8(svptrue_b8(), p_dst, - svuzp1_u8(svreinterpret_u8_u16(result16_0), - svreinterpret_u8_u16(result16_1))); - }); - } else if constexpr (std::is_same::value) { - loop.unroll_twice([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - svst1_u16(svptrue_b16(), p_dst, result16); - }); - } - loop.unroll_once([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t result = calculate_linear(pg_all32, x); - if constexpr (std::is_same::value) { - svst1b_u32(pg_all32, p_dst, result); - } - if constexpr (std::is_same::value) { - svst1h_u32(pg_all32, p_dst, result); - } - }); - loop.remaining([&](size_t x, size_t x_max) { - ScalarType* p_dst = &dst[static_cast(x)]; - svbool_t pg32 = svwhilelt_b32(x, x_max); - svuint32_t result = calculate_linear(pg32, x); - if constexpr (std::is_same::value) { - svst1b_u32(pg32, p_dst, result); - } - if constexpr (std::is_same::value) { - svst1h_u32(pg32, p_dst, result); - } - }); - } else { - static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || - Inter == KLEIDICV_INTERPOLATION_LINEAR, - ": Unknown interpolation type!"); - } - ++mapx_rows; - ++mapy_rows; - ++dst_rows; - } -} // NOLINTEND(readability-function-cognitive-complexity) -// Most of the complexity comes from parameter checking. 
-// NOLINTBEGIN(readability-function-cognitive-complexity) -template -kleidicv_error_t remap_f32_sc(const T* src, size_t src_stride, size_t src_width, - size_t src_height, T* dst, size_t dst_stride, - size_t dst_width, size_t dst_height, - size_t channels, const float* mapx, - size_t mapx_stride, const float* mapy, - size_t mapy_stride, - kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t border_type, - [[maybe_unused]] const T* border_value) { - // may need to remove the maybe_unused - CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); - CHECK_POINTER_AND_STRIDE(mapx, mapx_stride, dst_height); - CHECK_POINTER_AND_STRIDE(mapy, mapy_stride, dst_height); - CHECK_IMAGE_SIZE(src_width, src_height); - CHECK_IMAGE_SIZE(dst_width, dst_height); - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { - return KLEIDICV_ERROR_NULL_POINTER; - } - - if (!remap_f32_is_implemented(src_stride, src_width, src_height, dst_width, - border_type, channels, interpolation)) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; - } - - // Calculating in float32_t will only be precise until 24 bits - if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24)) { - return KLEIDICV_ERROR_RANGE; - } - - Rows src_rows{src, src_stride, channels}; - Rows mapx_rows{mapx, mapx_stride, 1}; - Rows mapy_rows{mapy, mapy_stride, 1}; - Rows dst_rows{dst, dst_stride, channels}; - Rectangle rect{dst_width, dst_height}; - - remap32f_process_rows(remap_image_is_large(src_rows, src_height), - interpolation, border_type, src_rows, src_width, - src_height, border_value, dst_rows, dst_width, 0, - dst_height, mapx_rows, mapy_rows); - - return KLEIDICV_OK; -} -// NOLINTEND(readability-function-cognitive-complexity) +#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5( \ + const type* src, size_t src_stride, size_t src_width, size_t src_height, \ + type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ + size_t channels, const int16_t* mapxy, size_t mapxy_stride, \ + const uint16_t* mapfrac, size_t mapfrac_stride, \ + kleidicv_border_type_t border_type, const type* border_value) -} // namespace KLEIDICV_TARGET_NAMESPACE +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); -#endif // KLEIDICV_REMAP_SC_H +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/remap_sve2.cpp b/kleidicv/src/transform/remap_sve2.cpp deleted file mode 100644 index 155b8d88c..000000000 --- a/kleidicv/src/transform/remap_sve2.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#include "remap_sc.h" - -namespace kleidicv::sve2 { - -template -kleidicv_error_t remap_s16(const T *src, size_t src_stride, size_t src_width, - size_t src_height, T *dst, size_t dst_stride, - size_t dst_width, size_t dst_height, size_t channels, - const int16_t *mapxy, size_t mapxy_stride, - kleidicv_border_type_t border_type, - const T *border_value) { - return remap_s16_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height, channels, mapxy, - mapxy_stride, border_type, border_value); -} - -template -kleidicv_error_t remap_s16point5(const T *src, size_t src_stride, - size_t src_width, size_t src_height, T *dst, - size_t dst_stride, size_t 
dst_width, - size_t dst_height, size_t channels, - const int16_t *mapxy, size_t mapxy_stride, - const uint16_t *mapfrac, size_t mapfrac_stride, - kleidicv_border_type_t border_type, - const T *border_value) { - return remap_s16point5_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height, channels, - mapxy, mapxy_stride, mapfrac, mapfrac_stride, - border_type, border_value); -} - -template -kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, - size_t src_height, T *dst, size_t dst_stride, - size_t dst_width, size_t dst_height, size_t channels, - const float *mapx, size_t mapx_stride, - const float *mapy, size_t mapy_stride, - kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t border_type, - const T *border_value) { - return remap_f32_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height, channels, mapx, - mapx_stride, mapy, mapy_stride, interpolation, - border_type, border_value); -} - -#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \ - template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16( \ - const type *src, size_t src_stride, size_t src_width, size_t src_height, \ - type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, const int16_t *mapxy, size_t mapxy_stride, \ - kleidicv_border_type_t border_type, const type *border_value) - -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint8_t); -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t); - -#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \ - template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5( \ - const type *src, size_t src_stride, size_t src_width, size_t src_height, \ - type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, const int16_t *mapxy, size_t mapxy_stride, \ - const uint16_t *mapfrac, size_t mapfrac_stride, \ - kleidicv_border_type_t border_type, const type *border_value) - -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); - -#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(type) \ - template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32( \ - const type *src, size_t src_stride, size_t src_width, size_t src_height, \ - type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, const float *mapx, size_t mapx_stride, \ - const float *mapy, size_t mapy_stride, \ - kleidicv_interpolation_type_t interpolation, \ - kleidicv_border_type_t border_type, const type *border_value) - -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint16_t); - -} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/transform_common.h b/kleidicv/src/transform/transform_common.h new file mode 100644 index 000000000..eaf005f66 --- /dev/null +++ b/kleidicv/src/transform/transform_common.h @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "kleidicv/types.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +bool is_image_large(const Rows &rows, size_t height) { + return rows.stride() * height >= 1ULL << 32; +} + +// Convert border_type to a template argument. 
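+// Each overload below peels one runtime flag off the argument list and
+// re-invokes transform_operation with that flag baked in as a template
+// parameter, so the innermost instantiation is specialised for the full
+// (is_large, interpolation, border) combination and the per-pixel loops
+// contain no runtime dispatch.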
+template +void transform_operation(kleidicv_border_type_t border_type, Args &&...args) { + if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { + transform_operation( + std::forward(args)...); + } else { + transform_operation( + std::forward(args)...); + } +} + +// Convert interpolation_type to a template argument. +template +void transform_operation(kleidicv_interpolation_type_t interpolation_type, + Args &&...args) { + if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) { + transform_operation( + std::forward(args)...); + } else { + transform_operation( + std::forward(args)...); + } +} + +// Convert is_large to a template argument. +template +void transform_operation(bool is_large, Args &&...args) { + if (KLEIDICV_UNLIKELY(is_large)) { + transform_operation(std::forward(args)...); + } else { + transform_operation(std::forward(args)...); + } +} + +} // namespace KLEIDICV_TARGET_NAMESPACE diff --git a/kleidicv/src/transform/transform_neon.h b/kleidicv/src/transform/transform_neon.h new file mode 100644 index 000000000..afcc72da8 --- /dev/null +++ b/kleidicv/src/transform/transform_neon.h @@ -0,0 +1,256 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "kleidicv/neon.h" +#include "kleidicv/types.h" +#include "transform_common.h" + +namespace kleidicv::neon { + +typedef struct { + float32x4_t x, y; +} FloatVectorPair; + +template +float32x4_t inline load_xy(uint32x4_t x, uint32x4_t y, uint32x4_t v_src_stride, + Rows& src_rows) { + if constexpr (IsLarge) { + uint64x2_t offset_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t offset_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + uint64_t acc = + static_cast(src_rows[vgetq_lane_u64(offset_low, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(offset_low, 1)]) << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = + static_cast(src_rows[vgetq_lane_u64(offset_high, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(offset_high, 1)]) << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + } else { + uint32x4_t offset = vmlaq_u32(x, y, v_src_stride); + uint64_t acc = + static_cast(src_rows[vgetq_lane_u32(offset, 0)]) | + (static_cast(src_rows[vgetq_lane_u32(offset, 1)]) << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = static_cast(src_rows[vgetq_lane_u32(offset, 2)]) | + (static_cast(src_rows[vgetq_lane_u32(offset, 3)]) << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + } +} + +template +float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, + uint32x4_t in_range, + ScalarType border_value, + uint32x4_t v_src_stride, + Rows src_rows) { + if constexpr (IsLarge) { + uint64x2_t offset_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t offset_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + uint64_t pixel0 = vgetq_lane_u32(in_range, 0) + ? src_rows[vgetq_lane_u64(offset_low, 0)] + : border_value; + uint64_t pixel1 = vgetq_lane_u32(in_range, 1) + ? src_rows[vgetq_lane_u64(offset_low, 1)] + : border_value; + uint64_t pixel2 = vgetq_lane_u32(in_range, 2) + ? src_rows[vgetq_lane_u64(offset_high, 0)] + : border_value; + uint64_t pixel3 = vgetq_lane_u32(in_range, 3) + ? 
src_rows[vgetq_lane_u64(offset_high, 1)]
+                          : border_value;
+    uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32));
+    rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1);
+    return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
+  } else {
+    uint32x4_t offset = vmlaq_u32(x, y, v_src_stride);
+    uint64_t pixel0 = vgetq_lane_u32(in_range, 0)
+                          ? src_rows[vgetq_lane_u32(offset, 0)]
+                          : border_value;
+    uint64_t pixel1 = vgetq_lane_u32(in_range, 1)
+                          ? src_rows[vgetq_lane_u32(offset, 1)]
+                          : border_value;
+    uint64_t pixel2 = vgetq_lane_u32(in_range, 2)
+                          ? src_rows[vgetq_lane_u32(offset, 2)]
+                          : border_value;
+    uint64_t pixel3 = vgetq_lane_u32(in_range, 3)
+                          ? src_rows[vgetq_lane_u32(offset, 3)]
+                          : border_value;
+    uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32));
+    rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1);
+    return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
+  }
+}
+
+template
+void load_quad_pixels_replicate(FloatVectorPair xy, uint32x4_t v_xmax,
+                                uint32x4_t v_ymax, uint32x4_t v_src_stride,
+                                Rows src_rows,
+                                float32x4_t& xfrac, float32x4_t& yfrac,
+                                float32x4_t& a, float32x4_t& b, float32x4_t& c,
+                                float32x4_t& d) {
+  auto&& [xf, yf] = xy;
+  // Truncating convert to int
+  uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax);
+  uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax);
+
+  // Get fractional part, or 0 if out of range
+  float32x4_t zero = vdupq_n_f32(0.F);
+  uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax));
+  uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax));
+  xfrac = vsubq_f32(xf, vrndmq_f32(xf));
+  yfrac = vsubq_f32(yf, vrndmq_f32(yf));
+
+  // x1 = x0 + 1, except if it's already xmax or out of range
+  // (subtracting the all-ones in-range mask adds 1)
+  uint32x4_t x1 = vsubq_u32(x0, x_in_range);
+  uint32x4_t y1 = vsubq_u32(y0, y_in_range);
+
+  // a: top left, b: top right, c: bottom left, d: bottom right
+  a = load_xy(x0, y0, v_src_stride, src_rows);
+  b = load_xy(x1, y0, v_src_stride, src_rows);
+  c = load_xy(x0, y1, v_src_stride, src_rows);
+  d = load_xy(x1, y1, v_src_stride, src_rows);
+}
+
+template
+void load_quad_pixels_constant(FloatVectorPair xy, uint32x4_t v_xmax,
+                               uint32x4_t v_ymax, uint32x4_t v_src_stride,
+                               ScalarType border_value,
+                               Rows src_rows,
+                               float32x4_t& xfrac, float32x4_t& yfrac,
+                               float32x4_t& a, float32x4_t& b, float32x4_t& c,
+                               float32x4_t& d) {
+  auto&& [xf, yf] = xy;
+  // Convert coordinates to integers, truncating towards minus infinity.
+  // Negative numbers will become large positive numbers.
+  // Since the source width and height are known to be <=2^24, these large
+  // positive numbers will always be treated as outside the source image
+  // bounds.
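+  // For example, xf = -0.25F floors to -1, which reinterpreted as uint32_t
+  // is 0xFFFFFFFF and compares above any permitted v_xmax, so that tap takes
+  // border_value; its neighbour x1 = 0 may still be in range, which gives
+  // the correct blend at the image edge.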
+  uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf));
+  uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf));
+  uint32x4_t x1 = vaddq_u32(x0, vdupq_n_u32(1));
+  uint32x4_t y1 = vaddq_u32(y0, vdupq_n_u32(1));
+  xfrac = vsubq_f32(xf, vrndmq_f32(xf));
+  yfrac = vsubq_f32(yf, vrndmq_f32(yf));
+  uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range;
+  {
+    uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax);
+    uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax);
+    uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax);
+    uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax);
+    a_in_range = vandq_u32(x0_in_range, y0_in_range);
+    b_in_range = vandq_u32(x1_in_range, y0_in_range);
+    c_in_range = vandq_u32(x0_in_range, y1_in_range);
+    d_in_range = vandq_u32(x1_in_range, y1_in_range);
+  }
+  a = load_xy_or_border(x0, y0, a_in_range, border_value,
+                        v_src_stride, src_rows);
+  b = load_xy_or_border(x1, y0, b_in_range, border_value,
+                        v_src_stride, src_rows);
+  c = load_xy_or_border(x0, y1, c_in_range, border_value,
+                        v_src_stride, src_rows);
+  d = load_xy_or_border(x1, y1, d_in_range, border_value,
+                        v_src_stride, src_rows);
+}
+
+// Bilinear blend: lerp along x on both rows, then lerp the rows along y.
+inline uint32x4_t lerp_2d(float32x4_t xfrac, float32x4_t yfrac, float32x4_t a,
+                          float32x4_t b, float32x4_t c, float32x4_t d) {
+  float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac);
+  float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac);
+  float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac);
+  return vcvtaq_u32_f32(result);
+}
+
+template
+void transform_pixels_replicate(float32x4_t xf, float32x4_t yf,
+                                uint32x4_t v_xmax, uint32x4_t v_ymax,
+                                uint32x4_t v_src_stride,
+                                Rows src_rows,
+                                Columns dst) {
+  // Round to nearest, with ties away from zero (i.e. round 0.5 up).
+  // Clamp coordinates to within the dimensions of the source image
+  // (vcvtaq already saturated negative values to 0)
+  uint32x4_t x = vminq_u32(vcvtaq_u32_f32(xf), v_xmax);
+  uint32x4_t y = vminq_u32(vcvtaq_u32_f32(yf), v_ymax);
+
+  // Copy pixels from source
+  if constexpr (IsLarge) {
+    uint64x2_t indices_low =
+        vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
+                  vget_low_u32(v_src_stride));
+    uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
+                                        vget_low_u32(v_src_stride));
+    dst[0] = src_rows[vgetq_lane_u64(indices_low, 0)];
+    dst[1] = src_rows[vgetq_lane_u64(indices_low, 1)];
+    dst[2] = src_rows[vgetq_lane_u64(indices_high, 0)];
+    dst[3] = src_rows[vgetq_lane_u64(indices_high, 1)];
+  } else {
+    uint32x4_t indices = vmlaq_u32(x, y, v_src_stride);
+    dst[0] = src_rows[vgetq_lane_u32(indices, 0)];
+    dst[1] = src_rows[vgetq_lane_u32(indices, 1)];
+    dst[2] = src_rows[vgetq_lane_u32(indices, 2)];
+    dst[3] = src_rows[vgetq_lane_u32(indices, 3)];
+  }
+}
+
+template
+void transform_pixels_constant(float32x4_t xf, float32x4_t yf,
+                               uint32x4_t v_xmax, uint32x4_t v_ymax,
+                               uint32x4_t v_src_stride,
+                               Rows src_rows,
+                               Columns dst,
+                               ScalarType border_value) {
+  // Convert coordinates to integers.
+  // Negative numbers will become large positive numbers.
+  // Since the source width and height are known to be <=2^24, these large
+  // positive numbers will always be treated as outside the source image
+  // bounds.
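+  // Unlike the linear path, vcvtaq (round to nearest, ties away from zero)
+  // is used here because nearest-neighbour needs the single closest pixel
+  // rather than the top-left corner of a 2x2 quad.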
+ uint32x4_t x = vreinterpretq_u32_s32(vcvtaq_s32_f32(xf)); + uint32x4_t y = vreinterpretq_u32_s32(vcvtaq_s32_f32(yf)); + uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); + + // Copy pixels from source + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + dst[0] = vgetq_lane_u32(in_range, 0) + ? src_rows[vgetq_lane_u64(indices_low, 0)] + : border_value; + dst[1] = vgetq_lane_u32(in_range, 1) + ? src_rows[vgetq_lane_u64(indices_low, 1)] + : border_value; + dst[2] = vgetq_lane_u32(in_range, 2) + ? src_rows[vgetq_lane_u64(indices_high, 0)] + : border_value; + dst[3] = vgetq_lane_u32(in_range, 3) + ? src_rows[vgetq_lane_u64(indices_high, 1)] + : border_value; + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); + dst[0] = vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u32(indices, 0)] + : border_value; + dst[1] = vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u32(indices, 1)] + : border_value; + dst[2] = vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u32(indices, 2)] + : border_value; + dst[3] = vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u32(indices, 3)] + : border_value; + } +} + +} // namespace kleidicv::neon diff --git a/kleidicv/src/transform/common_sc.h b/kleidicv/src/transform/transform_sve2.h similarity index 60% rename from kleidicv/src/transform/common_sc.h rename to kleidicv/src/transform/transform_sve2.h index 17ac56866..74b6d889a 100644 --- a/kleidicv/src/transform/common_sc.h +++ b/kleidicv/src/transform/transform_sve2.h @@ -18,56 +18,14 @@ #include "kleidicv/sve2.h" #include "kleidicv/traits.h" #include "kleidicv/types.h" +#include "transform_common.h" -namespace KLEIDICV_TARGET_NAMESPACE { - -// Convert border_type to a template argument. -template -void remap32f_process_rows(kleidicv_border_type_t border_type, Args &&...args) { - if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { - remap32f_process_rows( - std::forward(args)...); - } else { - remap32f_process_rows( - std::forward(args)...); - } -} - -// Convert interpolation_type to a template argument. -template -void remap32f_process_rows(kleidicv_interpolation_type_t interpolation_type, - Args &&...args) { - if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) { - remap32f_process_rows( - std::forward(args)...); - } else { - remap32f_process_rows( - std::forward(args)...); - } -} - -template -bool remap_image_is_large(const Rows &rows, size_t height) { - return rows.stride() * height >= 1ULL << 32; -} - -// Convert is_large to a template argument. 
-template -void remap32f_process_rows(bool is_large, Args &&...args) { - if (KLEIDICV_UNLIKELY(is_large)) { - remap32f_process_rows(std::forward(args)...); - } else { - remap32f_process_rows(std::forward(args)...); - } -} +namespace kleidicv::sve2 { template -svuint32_t inline load_common(svbool_t pg, svuint32_t x, svuint32_t y, - svuint32_t sv_src_stride, - Rows &src_rows) { +svuint32_t inline load_xy(svbool_t pg, svuint32_t x, svuint32_t y, + svuint32_t sv_src_stride, + Rows &src_rows) { if constexpr (std::is_same::value) { if constexpr (IsLarge) { svbool_t pg_b = pg; @@ -115,7 +73,7 @@ svuint32_t inline calculate_linear_replicated_border( svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf, svuint32_t sv_src_stride, Rows &src_rows) { auto load_source = [&](svuint32_t x, svuint32_t y) { - return load_common(pg, x, y, sv_src_stride, src_rows); + return load_xy(pg, x, y, sv_src_stride, src_rows); }; svbool_t pg_all32 = svptrue_b32(); svfloat32_t xf = svget2(coords, 0); @@ -164,7 +122,7 @@ svuint32_t get_pixels_or_border(svbool_t pg, svuint32_t x, svuint32_t y, svbool_t in_range = svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); svuint32_t result = - load_common(in_range, x, y, sv_src_stride, src_rows); + load_xy(in_range, x, y, sv_src_stride, src_rows); // Select between source pixels and border colour return svsel_u32(in_range, result, sv_border); } @@ -174,48 +132,44 @@ svuint32_t inline calculate_linear_constant_border( svbool_t pg, svfloat32x2_t coords, svuint32_t sv_border, svuint32_t sv_xmax, svuint32_t sv_ymax, svuint32_t sv_src_stride, Rows &src_rows) { - svfloat32_t xf = svget2(coords, 0); - svfloat32_t yf = svget2(coords, 1); - - // Convert obviously out-of-range coordinates to values that are just beyond - // the largest permitted image width & height. This avoids the need for - // special case handling elsewhere. - svfloat32_t big = svdup_n_f32(1 << 24); - xf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, xf), big), xf, big); - yf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, yf), big), yf, big); - - svfloat32_t xf0 = svrintm_f32_x(pg, xf); - svfloat32_t yf0 = svrintm_f32_x(pg, yf); - - svint32_t x0 = svcvt_s32_x(pg, xf0); - svint32_t y0 = svcvt_s32_x(pg, yf0); - svint32_t x1 = svadd_s32_x(pg, x0, svdup_n_s32(1)); - svint32_t y1 = svadd_s32_x(pg, y0, svdup_n_s32(1)); - - svfloat32_t xfrac = svsub_f32_x(pg, xf, xf0); - svfloat32_t yfrac = svsub_f32_x(pg, yf, yf0); + // Convert coordinates to integers, truncating towards minus infinity. + // Negative numbers will become large positive numbers. + // Since the source width and height is known to be <=2^24 these large + // positive numbers will always be treated as outside the source image + // bounds. 
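+  // x0/y0 hold the floor of each map coordinate, x1/y1 the neighbouring
+  // texel, and xfrac/yfrac the bilinear weights used by the svmla blends
+  // below.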
+ svuint32_t x0, y0, x1, y1; + svfloat32_t xfrac, yfrac; + { + svfloat32_t xf = svget2(coords, 0); + svfloat32_t yf = svget2(coords, 1); + svfloat32_t xf0 = svrintm_f32_x(pg, xf); + svfloat32_t yf0 = svrintm_f32_x(pg, yf); + x0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, xf0)); + y0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, yf0)); + x1 = svadd_u32_x(pg, x0, svdup_n_u32(1)); + y1 = svadd_u32_x(pg, y0, svdup_n_u32(1)); + + xfrac = svsub_f32_x(pg, xf, xf0); + yfrac = svsub_f32_x(pg, yf, yf0); + } - svfloat32_t a = svcvt_f32_u32_x( - pg, get_pixels_or_border( - pg, svreinterpret_u32_s32(x0), svreinterpret_u32_s32(y0), - sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows)); - svfloat32_t b = svcvt_f32_u32_x( - pg, get_pixels_or_border( - pg, svreinterpret_u32_s32(x1), svreinterpret_u32_s32(y0), - sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows)); + svfloat32_t a = svcvt_f32_u32_x(pg, get_pixels_or_border( + pg, x0, y0, sv_border, sv_xmax, + sv_ymax, sv_src_stride, src_rows)); + svfloat32_t b = svcvt_f32_u32_x(pg, get_pixels_or_border( + pg, x1, y0, sv_border, sv_xmax, + sv_ymax, sv_src_stride, src_rows)); svfloat32_t line0 = svmla_f32_x(pg, a, svsub_f32_x(pg, b, a), xfrac); - svfloat32_t c = svcvt_f32_u32_x( - pg, get_pixels_or_border( - pg, svreinterpret_u32_s32(x0), svreinterpret_u32_s32(y1), - sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows)); - svfloat32_t d = svcvt_f32_u32_x( - pg, get_pixels_or_border( - pg, svreinterpret_u32_s32(x1), svreinterpret_u32_s32(y1), - sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows)); + svfloat32_t c = svcvt_f32_u32_x(pg, get_pixels_or_border( + pg, x0, y1, sv_border, sv_xmax, + sv_ymax, sv_src_stride, src_rows)); + svfloat32_t d = svcvt_f32_u32_x(pg, get_pixels_or_border( + pg, x1, y1, sv_border, sv_xmax, + sv_ymax, sv_src_stride, src_rows)); svfloat32_t line1 = svmla_f32_x(pg, c, svsub_f32_x(pg, d, c), xfrac); svfloat32_t result = svmla_f32_x(pg, line0, svsub_f32_x(pg, line1, line0), yfrac); return svcvt_u32_f32_x(pg, svrinta_f32_x(pg, result)); } -} // namespace KLEIDICV_TARGET_NAMESPACE +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp index c42bc0ef2..dd7494523 100644 --- a/kleidicv/src/transform/warp_perspective_neon.cpp +++ b/kleidicv/src/transform/warp_perspective_neon.cpp @@ -7,11 +7,11 @@ #include #include "kleidicv/ctypes.h" -#include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" #include "kleidicv/traits.h" #include "kleidicv/transform/warp_perspective.h" #include "kleidicv/utils.h" +#include "transform_neon.h" namespace kleidicv::neon { @@ -33,161 +33,17 @@ namespace kleidicv::neon { // yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) // -namespace { - -typedef struct { - float32x4_t x, y; -} CoordVectorPair; - -template -inline uint32x4_t get_pixels_or_border_large(uint32x4_t x, uint32x4_t y, - uint32x4_t v_xmax, - uint32x4_t v_ymax, - uint32x2_t v_src_stride, - Rows src_rows, - const ScalarType *border_value) { - uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits - uint64x2_t offsets_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride); - uint64x2_t offsets_high = - vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride); - // Copy pixels from source - uint32_t pixel_data[4] = { - vgetq_lane_u32(in_range, 0) ? 
src_rows[vgetq_lane_u64(offsets_low, 0)] - : border_value[0], - vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u64(offsets_low, 1)] - : border_value[0], - vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u64(offsets_high, 0)] - : border_value[0], - vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u64(offsets_high, 1)] - : border_value[0], - }; - return vld1q_u32(pixel_data); -} - -template -inline uint32x4_t get_pixels_or_border_small(uint32x4_t x, uint32x4_t y, - uint32x4_t v_xmax, - uint32x4_t v_ymax, - uint32x4_t v_src_stride, - Rows src_rows, - const ScalarType *border_value) { - uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); - // Calculate offsets from coordinates (y * stride + x) - // Use this path only when the final offsets fit into 32 bits - uint32x4_t offsets = vmlaq_u32(x, y, v_src_stride); - // Copy pixels from source - uint32_t pixel_data[4] = { - vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u32(offsets, 0)] - : border_value[0], - vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u32(offsets, 1)] - : border_value[0], - vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u32(offsets, 2)] - : border_value[0], - vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u32(offsets, 3)] - : border_value[0], - }; - return vld1q_u32(pixel_data); -} - -template -inline void get_edge_pixels(unsigned &a_result, unsigned &b_result, - unsigned &c_result, unsigned &d_result, int x0, - int y0, ptrdiff_t offset, - Rows src_rows, int src_width, - int src_height) { - if (y0 >= 0) { - if (x0 >= 0) { - a_result = src_rows[offset]; - } - if (x0 + 1 < src_width) { - b_result = src_rows[offset + 1]; - } - } - if (y0 + 1 < src_height) { - offset += src_rows.stride(); - if (x0 >= 0) { - c_result = src_rows[offset]; - } - if (x0 + 1 < src_width) { - d_result = src_rows[offset + 1]; - } - } -} - -template -inline uint32x4_t calculate_linear_constant_border( - float32x4_t xf, float32x4_t yf, Rows src_rows, - int src_width, int src_height, const ScalarType *border_value) { - // Convert obviously out-of-range coordinates to values that are just beyond - // the largest permitted image width & height. This avoids the need for - // special case handling elsewhere. 
- float32x4_t big = vdupq_n_f32(1 << 24); - xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big); - yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big); - - int32x4_t x0 = vcvtmq_s32_f32(xf); - int32x4_t y0 = vcvtmq_s32_f32(yf); - int x0_array[4], y0_array[4]; - unsigned a_array[4], b_array[4], c_array[4], d_array[4]; - vst1q_s32(x0_array, x0); - vst1q_s32(y0_array, y0); - for (int i = 0; i < 4; ++i) { - int x0i = x0_array[i]; - int y0i = y0_array[i]; - ptrdiff_t offset = x0i + y0i * src_rows.stride(); - - if (x0i < 0 || x0i + 1 >= src_width || y0i < 0 || y0i + 1 >= src_height) { - // Not entirely within the source image - - a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value[0]; - - if (x0i < -1 || x0i >= src_width || y0i < -1 || y0i >= src_height) { - // Completely outside the source image - continue; - } - - get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, y0i, - offset, src_rows, src_width, src_height); - continue; - } - - // Completely inside the source image - a_array[i] = src_rows[offset]; - b_array[i] = src_rows[offset + 1]; - offset += src_rows.stride(); - c_array[i] = src_rows[offset]; - d_array[i] = src_rows[offset + 1]; - } - - float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf)); - float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf)); - float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array)); - float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array)); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array)); - float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array)); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vcvtaq_u32_f32(result); -} - template -void warp_perspective_operation(Rows src_rows, - size_t src_width, size_t src_height, - const float transform[9], - const ScalarType *border_value, - Rows dst_rows, size_t dst_width, - size_t y_begin, size_t y_end) { +void transform_operation(Rows src_rows, size_t src_width, + size_t src_height, const float transform[9], + const ScalarType *border_value, + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end) { static constexpr uint32_t first_few_x[] = {0, 1, 2, 3}; uint32x4_t x0123_ = vld1q_u32(first_few_x); - uint32x2_t v_src_stride = - vdup_n_u32(static_cast(src_rows.stride())); - uint32x4_t vq_src_stride = + uint32x4_t v_src_stride = vdupq_n_u32(static_cast(src_rows.stride())); uint32x4_t v_xmax = vdupq_n_u32(static_cast(src_width - 1)); uint32x4_t v_ymax = vdupq_n_u32(static_cast(src_height - 1)); @@ -205,154 +61,27 @@ void warp_perspective_operation(Rows src_rows, // Calculate coordinates into the source image float32x4_t xf = vmulq_f32(tx, iw); float32x4_t yf = vmulq_f32(ty, iw); - return CoordVectorPair{xf, yf}; - }; - - auto vector_path_nearest_small = [&](uint32_t x, Columns dst) { - auto &&[xf, yf] = calculate_coordinates(x); - // Take the integer part, clamp it to within the dimensions of the - // source image (negative values are already saturated to 0) - uint32x4_t xi = vminq_u32(v_xmax, vcvtaq_u32_f32(xf)); - uint32x4_t yi = vminq_u32(v_ymax, vcvtaq_u32_f32(yf)); - // Calculate offsets from coordinates (y * stride + x) - // Use this path only when the final offsets fit into 32 bits - uint32x4_t offsets = vmlaq_u32(xi, yi, vq_src_stride); - // Copy pixels from source - ptrdiff_t ix = static_cast(x); - dst[ix] = src_rows[vgetq_lane_u32(offsets, 0)]; - dst[ix + 1] = src_rows[vgetq_lane_u32(offsets, 1)]; - dst[ix + 2] = 
src_rows[vgetq_lane_u32(offsets, 2)]; - dst[ix + 3] = src_rows[vgetq_lane_u32(offsets, 3)]; - }; - - auto vector_path_nearest_large = [&](uint32_t x, Columns dst) { - auto &&[xf, yf] = calculate_coordinates(x); - // Take the integer part, clamp it to within the dimensions of the - // source image (negative values are already saturated to 0) - uint32x4_t xi = vminq_u32(v_xmax, vcvtaq_u32_f32(xf)); - uint32x4_t yi = vminq_u32(v_ymax, vcvtaq_u32_f32(yf)); - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits - uint64x2_t offsets_low = - vmlal_u32(vmovl_u32(vget_low_u32(xi)), vget_low_u32(yi), v_src_stride); - uint64x2_t offsets_high = - vmlal_u32(vmovl_high_u32(xi), vget_high_u32(yi), v_src_stride); - // Copy pixels from source - ptrdiff_t ix = static_cast(x); - dst[ix] = src_rows[vgetq_lane_u64(offsets_low, 0)]; - dst[ix + 1] = src_rows[vgetq_lane_u64(offsets_low, 1)]; - dst[ix + 2] = src_rows[vgetq_lane_u64(offsets_high, 0)]; - dst[ix + 3] = src_rows[vgetq_lane_u64(offsets_high, 1)]; - }; - - auto vector_path_nearest_constant_border = [&](uint32_t x, - Columns dst) { - auto &&[xf, yf] = calculate_coordinates(x); - // Convert coordinates to integers. - // Negative numbers will become large positive numbers. - // Since the source width and height is known to be <=2^24 these large - // positive numbers will always be treated as outside the source image - // bounds. - uint32x4_t xi = vreinterpretq_u32_s32(vcvtaq_s32_f32(xf)); - uint32x4_t yi = vreinterpretq_u32_s32(vcvtaq_s32_f32(yf)); - uint32x4_t pixels; - if constexpr (IsLarge) { - pixels = get_pixels_or_border_large(xi, yi, v_xmax, v_ymax, v_src_stride, - src_rows, border_value); - } else { - pixels = get_pixels_or_border_small(xi, yi, v_xmax, v_ymax, vq_src_stride, - src_rows, border_value); - } - - ptrdiff_t ix = static_cast(x); - dst[ix] = static_cast(vgetq_lane_u32(pixels, 0)); - dst[ix + 1] = static_cast(vgetq_lane_u32(pixels, 1)); - dst[ix + 2] = static_cast(vgetq_lane_u32(pixels, 2)); - dst[ix + 3] = static_cast(vgetq_lane_u32(pixels, 3)); - }; - - auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { - uint32x4_t offset = vmlaq_u32(x, y, vq_src_stride); - uint64_t acc = - static_cast(src_rows[vgetq_lane_u32(offset, 0)]) | - (static_cast(src_rows[vgetq_lane_u32(offset, 1)]) << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows[vgetq_lane_u32(offset, 2)]) | - (static_cast(src_rows[vgetq_lane_u32(offset, 3)]) << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) { - uint64x2_t offset_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride); - uint64x2_t offset_high = - vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride); - uint64_t acc = - static_cast(src_rows[vgetq_lane_u64(offset_low, 0)]) | - (static_cast(src_rows[vgetq_lane_u64(offset_low, 1)]) << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = - static_cast(src_rows[vgetq_lane_u64(offset_high, 0)]) | - (static_cast(src_rows[vgetq_lane_u64(offset_high, 1)]) << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto calculate_linear_replicated_border = [&](uint32_t x) { - auto load_floats = [&](uint32x4_t x, uint32x4_t y) { - if constexpr (IsLarge) { - return load_src_into_floats_large(x, y); - } else { - return load_src_into_floats_small(x, y); - } - }; - 
auto &&[xf, yf] = calculate_coordinates(x); - // Truncating convert to int - uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax); - uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax); - - // Get fractional part, or 0 if out of range - float32x4_t zero = vdupq_n_f32(0.F); - uint32x4_t x_in_range = - vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax)); - uint32x4_t y_in_range = - vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax)); - float32x4_t xfrac = - vbslq_f32(x_in_range, vsubq_f32(xf, vrndmq_f32(xf)), zero); - float32x4_t yfrac = - vbslq_f32(y_in_range, vsubq_f32(yf, vrndmq_f32(yf)), zero); - - // x1 = x0 + 1, except if it's already xmax or out of range - uint32x4_t x1 = vsubq_u32(x0, x_in_range); - uint32x4_t y1 = vsubq_u32(y0, y_in_range); - - // Calculate offsets from coordinates (y * stride + x) - // a: top left, b: top right, c: bottom left, d: bottom right - float32x4_t a = load_floats(x0, y0); - float32x4_t b = load_floats(x1, y0); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = load_floats(x0, y1); - float32x4_t d = load_floats(x1, y1); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vminq_u32(vdupq_n_u32(0xFF), vcvtaq_u32_f32(result)); + return FloatVectorPair{xf, yf}; }; auto calculate_linear = [&](uint32_t x) { + float32x4_t a, b, c, d, xfrac, yfrac; if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - return calculate_linear_replicated_border(x); + load_quad_pixels_replicate( + calculate_coordinates(x), v_xmax, v_ymax, v_src_stride, src_rows, + xfrac, yfrac, a, b, c, d); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - auto &&[xf, yf] = calculate_coordinates(x); - return calculate_linear_constant_border( - xf, yf, src_rows, static_cast(src_width), - static_cast(src_height), border_value); + load_quad_pixels_constant( + calculate_coordinates(x), v_xmax, v_ymax, v_src_stride, + border_value[0], src_rows, xfrac, yfrac, a, b, c, d); } + return lerp_2d(xfrac, yfrac, a, b, c, d); }; - auto process_row = [&](uint32_t y, Columns dst) { + for (size_t y = y_begin; y < y_end; ++y) { float dy = static_cast(y); + Columns dst = dst_rows.as_columns(); // Calculate half-transformed values at the first pixel (nominators) // tw = T6*x + T7*y + T8 // tx = (T0*x + T1*y + T2) / tw @@ -364,20 +93,23 @@ void warp_perspective_operation(Rows src_rows, static const size_t kStep = VecTraits::num_lanes(); LoopUnroll2 loop{dst_width, kStep}; if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { loop.unroll_once([&](size_t x) { - vector_path_nearest_constant_border(static_cast(x), dst); - }); - } else if constexpr (IsLarge) { - loop.unroll_once([&](size_t x) { - vector_path_nearest_large(static_cast(x), dst); + auto &&[xf, yf] = calculate_coordinates(x); + transform_pixels_replicate( + xf, yf, v_xmax, v_ymax, v_src_stride, src_rows, dst.at(x)); }); } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); loop.unroll_once([&](size_t x) { - vector_path_nearest_small(static_cast(x), dst); + auto &&[xf, yf] = calculate_coordinates(x); + transform_pixels_constant( + xf, yf, v_xmax, v_ymax, v_src_stride, src_rows, dst.at(x), + border_value[0]); }); } - } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { + } else { + static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); loop.unroll_four_times([&](size_t _x) { 
uint32_t x = static_cast(_x); ScalarType *p_dst = &dst[static_cast(_x)]; @@ -400,54 +132,11 @@ void warp_perspective_operation(Rows src_rows, p_dst[2] = vgetq_lane_u32(res, 2); p_dst[3] = vgetq_lane_u32(res, 3); }); - } else { - static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || - Inter == KLEIDICV_INTERPOLATION_LINEAR, - ": Unknown interpolation type!"); } - }; - - for (size_t y = y_begin; y < y_end; ++y) { - process_row(y, dst_rows.as_columns()); ++dst_rows; } } -// Convert border_type to a template argument. -template -void warp_perspective_operation(kleidicv_border_type_t border_type, - Args &&...args) { - if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { - warp_perspective_operation( - std::forward(args)...); - } else { - warp_perspective_operation( - std::forward(args)...); - } -} - -// Convert interpolation_type to a template argument. -template -void warp_perspective_operation( - kleidicv_interpolation_type_t interpolation_type, Args &&...args) { - if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) { - warp_perspective_operation( - std::forward(args)...); - } else { - warp_perspective_operation( - std::forward(args)...); - } -} - -} // namespace - -// Most of the complexity comes from parameter checking. -// NOLINTBEGIN(readability-function-cognitive-complexity) template kleidicv_error_t warp_perspective_stripe( const T *src, size_t src_stride, size_t src_width, size_t src_height, @@ -467,9 +156,10 @@ kleidicv_error_t warp_perspective_stripe( // Calculating in float32_t will only be precise until 24 bits, and // multiplication can only be done with 32x32 bits + // Empty source image is not supported if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || - src_stride >= (1ULL << 32)) { + src_stride >= (1ULL << 32) || src_width == 0 || src_height == 0) { return KLEIDICV_ERROR_RANGE; } @@ -477,16 +167,10 @@ kleidicv_error_t warp_perspective_stripe( Rows dst_rows{dst, dst_stride, channels}; dst_rows += y_begin; - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - warp_perspective_operation( - interpolation, border_type, src_rows, src_width, src_height, - transformation, border_value, dst_rows, dst_width, y_begin, y_end); - } else { - warp_perspective_operation( - interpolation, border_type, src_rows, src_width, src_height, - transformation, border_value, dst_rows, dst_width, y_begin, y_end); - } - // NOLINTEND(readability-function-cognitive-complexity) + transform_operation(is_image_large(src_rows, src_height), interpolation, + border_type, src_rows, src_width, src_height, + transformation, border_value, dst_rows, dst_width, + y_begin, y_end); return KLEIDICV_OK; } diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h deleted file mode 100644 index d75958b86..000000000 --- a/kleidicv/src/transform/warp_perspective_sc.h +++ /dev/null @@ -1,299 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "common_sc.h" -#include "kleidicv/ctypes.h" -#include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" -#include "kleidicv/traits.h" -#include "kleidicv/transform/warp_perspective.h" -#include "kleidicv/types.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Gather load is not available in streaming mode, and in general random access -// is not recommended for SME. 
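// The !KLEIDICV_TARGET_SME2 guard below exists because these kernels fetch
// pixels through SVE gather loads, which are not available in streaming
// mode. A minimal sketch of that access pattern, using a hypothetical helper
// name (this is not code from the patch):

#include <arm_sve.h>
#include <cstdint>

svuint32_t gather_pixels(svbool_t pg, const uint8_t *base, svuint32_t x,
                         svuint32_t y, svuint32_t stride) {
  // Per-lane offset into the image: y * stride + x.
  svuint32_t offsets = svmla_u32_x(pg, x, y, stride);
  // One independent byte load per active lane, zero-extended to 32 bits.
  // This is the instruction class that streaming mode lacks.
  return svld1ub_gather_u32offset_u32(pg, base, offsets);
}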
-#if !KLEIDICV_TARGET_SME2 - -// Template for WarpPerspective transformation. -// Destination pixels are filled from the source, by taking pixels using the -// transformed coordinates that are calculated as follows: -// -// [ T0, T1, T2 ] [ x ] -// (x',y',w') = [ T3, T4, T5 ] * [ y ] -// [ T6, T7, T8 ] [ 1 ] -// then -// -// xt = x' / w' -// yt = y' / w' -// -// or putting it together: -// -// xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8) -// yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) -// - -template -void remap32f_process_rows(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value, - Rows dst_rows, size_t dst_width, - size_t y_begin, size_t y_end, - const float transform[9]) { - svbool_t pg_all32 = svptrue_b32(); - svuint32_t sv_xmax = svdup_n_u32(src_width - 1); - svuint32_t sv_ymax = svdup_n_u32(src_height - 1); - svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); - svuint32_t sv_border; - // sv_border is only used if the border type is constant. - // If the border type is not constant then border_value is permitted to be - // null and must not be read. - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - sv_border = svdup_n_u32(border_value[0]); - } - - svfloat32_t xmaxf = svdup_n_f32(static_cast(src_width - 1)); - svfloat32_t ymaxf = svdup_n_f32(static_cast(src_height - 1)); - - const size_t kStep = VecTraits::num_lanes(); - - svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1)); - svfloat32_t T0 = svdup_n_f32(transform[0]); - svfloat32_t T3 = svdup_n_f32(transform[3]); - svfloat32_t T6 = svdup_n_f32(transform[6]); - svfloat32_t tx0, ty0, tw0; - - auto coordinate_getter = [&](svbool_t, size_t x) { - svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast(x)); - // Calculate half-transformed values from the first few pixel values, - // plus Tn*x, similarly to the one above - // Calculate inverse weight because division is expensive - svfloat32_t iw = - svdiv_f32_x(pg_all32, svdup_n_f32(1.F), svmla_x(pg_all32, tw0, vx, T6)); - svfloat32_t tx = svmla_x(pg_all32, tx0, vx, T0); - svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3); - - // Calculate coordinates into the source image - return svcreate2(svmul_f32_x(pg_all32, tx, iw), - svmul_f32_x(pg_all32, ty, iw)); - }; - - auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) { - svfloat32x2_t coords = coordinate_getter(pg32, x); - svfloat32_t xf = svget2(coords, 0); - svfloat32_t yf = svget2(coords, 1); - - svuint32_t xi, yi; - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - // Round to the nearest integer - xi = svreinterpret_u32_s32( - svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); - yi = svreinterpret_u32_s32( - svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); - } else { - // Round to the nearest integer, clamp it to within the dimensions of the - // source image (negative values are already saturated to 0) - xi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), - sv_xmax); - yi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), - sv_ymax); - } - return svcreate2(xi, yi); - }; - - auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) { - svbool_t in_range = - svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); - svuint32_t result = load_common( - in_range, x, y, sv_src_stride, src_rows); - // Select between source pixels and border colour - return svsel_u32(in_range, result, sv_border); - }; - - auto vector_path_nearest_4x = 
[&](size_t x, Columns dst) { - auto load_source = [&](svuint32x2_t coords) { - svuint32_t x = svget2(coords, 0); - svuint32_t y = svget2(coords, 1); - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - return get_pixels_or_border(pg_all32, x, y); - } else { - return load_common(pg_all32, x, y, sv_src_stride, - src_rows); - } - }; - ScalarType *p_dst = &dst[static_cast(x)]; - svuint32_t res32_0 = - load_source(calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - load_source(calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - x += kStep; - res32_0 = load_source(calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - res32_1 = load_source(calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svuint8_t result = - svuzp1_u8(svreinterpret_u8_u16(result0), svreinterpret_u8_u16(result1)); - svst1(svptrue_b8(), p_dst, result); - }; - - auto vector_path_nearest_tail = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - - svuint32x2_t coords = calculate_nearest_coordinates(pg32, x); - svuint32_t xi = svget2(coords, 0); - svuint32_t yi = svget2(coords, 1); - - svuint32_t result; - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - result = get_pixels_or_border(pg32, xi, yi); - } else { - result = load_common(pg32, xi, yi, sv_src_stride, - src_rows); - } - svst1b_u32(pg32, &dst[static_cast(x)], result); - }; - - auto calculate_linear = [&](svbool_t pg, uint32_t x) { - if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_replicated_border( - pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); - } else { - static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_constant_border( - pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); - } - }; - - auto process_row = [&](size_t y) { - float fy = static_cast(y); - // Calculate half-transformed values at the first pixel (nominators) - // tw = T6*x + T7*y + T8 - // tx = (T0*x + T1*y + T2) / tw - // ty = (T3*x + T4*y + T5) / tw - tx0 = svdup_n_f32(fmaf(transform[1], fy, transform[2])); - ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5])); - tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8])); - - Columns dst = dst_rows.as_columns(); - LoopUnroll2 loop{dst_width, kStep}; - if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); }); - loop.unroll_once( - [&](size_t x) { vector_path_nearest_tail(x, x + kStep, dst); }); - loop.remaining([&](size_t x, size_t length) { - vector_path_nearest_tail(x, length, dst); - }); - } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { - loop.unroll_four_times([&](size_t x) { - ScalarType *p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - x += kStep; - res0 = calculate_linear(pg_all32, x); - x += kStep; - res1 = calculate_linear(pg_all32, x); - svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - svst1_u8(svptrue_b8(), p_dst, - 
svuzp1_u8(svreinterpret_u8_u16(result16_0), - svreinterpret_u8_u16(result16_1))); - }); - loop.unroll_once([&](size_t x) { - ScalarType *p_dst = &dst[static_cast(x)]; - svuint32_t result = calculate_linear(pg_all32, x); - svst1b_u32(pg_all32, p_dst, result); - }); - loop.remaining([&](size_t x, size_t x_max) { - ScalarType *p_dst = &dst[static_cast(x)]; - svbool_t pg32 = svwhilelt_b32(x, x_max); - svuint32_t result = calculate_linear(pg32, x); - svst1b_u32(pg32, p_dst, result); - }); - } else { - static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || - Inter == KLEIDICV_INTERPOLATION_LINEAR, - ": Unknown interpolation type!"); - } - }; - - for (size_t y = y_begin; y < y_end; ++y) { - process_row(y); - ++dst_rows; - } -} - -// Most of the complexity comes from parameter checking. -// NOLINTBEGIN(readability-function-cognitive-complexity) -template -KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc( - const T *src, size_t src_stride, size_t src_width, size_t src_height, - T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t y_begin, size_t y_end, const float transform[9], size_t channels, - kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t border_type, const T *border_value) { - CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); - CHECK_POINTERS(transform); - CHECK_IMAGE_SIZE(src_width, src_height); - CHECK_IMAGE_SIZE(dst_width, dst_height); - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { - return KLEIDICV_ERROR_NULL_POINTER; - } - - // Calculating in float32_t will only be precise until 24 bits, and - // multiplication can only be done with 32x32 bits - if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || - src_stride >= (1ULL << 32)) { - return KLEIDICV_ERROR_RANGE; - } - - Rows src_rows{src, src_stride, channels}; - Rows dst_rows{dst, dst_stride, channels}; - Rectangle rect{dst_width, dst_height}; - - dst_rows += y_begin; - - remap32f_process_rows(remap_image_is_large(src_rows, src_height), - interpolation, border_type, src_rows, src_width, - src_height, border_value, dst_rows, dst_width, - y_begin, y_end, transform); - - return KLEIDICV_OK; -} -// NOLINTEND(readability-function-cognitive-complexity) - -#define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE_SC(type) \ - template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t \ - warp_perspective_stripe_sc( \ - const type *src, size_t src_stride, size_t src_width, size_t src_height, \ - type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t y_begin, size_t y_end, const float transformation[9], \ - size_t channels, kleidicv_interpolation_type_t interpolation, \ - kleidicv_border_type_t border_type, const type *border_value) - -KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE_SC(uint8_t); - -#endif - -} // namespace KLEIDICV_TARGET_NAMESPACE diff --git a/kleidicv/src/transform/warp_perspective_sve2.cpp b/kleidicv/src/transform/warp_perspective_sve2.cpp index 5e3649050..b1f3df8d2 100644 --- a/kleidicv/src/transform/warp_perspective_sve2.cpp +++ b/kleidicv/src/transform/warp_perspective_sve2.cpp @@ -1,23 +1,275 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 +#include + +#include +#include + #include "kleidicv/ctypes.h" -#include "warp_perspective_sc.h" +#include "kleidicv/sve2.h" 
+#include "kleidicv/types.h" +#include "transform_sve2.h" namespace kleidicv::sve2 { +// Template for WarpPerspective transformation. +// Destination pixels are filled from the source, by taking pixels using the +// transformed coordinates that are calculated as follows: +// +// [ T0, T1, T2 ] [ x ] +// (x',y',w') = [ T3, T4, T5 ] * [ y ] +// [ T6, T7, T8 ] [ 1 ] +// then +// +// xt = x' / w' +// yt = y' / w' +// +// or putting it together: +// +// xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8) +// yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) +// + +template +void transform_operation(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value, + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end, + const float transform[9]) { + svbool_t pg_all32 = svptrue_b32(); + svuint32_t sv_xmax = svdup_n_u32(src_width - 1); + svuint32_t sv_ymax = svdup_n_u32(src_height - 1); + svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); + svuint32_t sv_border; + // sv_border is only used if the border type is constant. + // If the border type is not constant then border_value is permitted to be + // null and must not be read. + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + sv_border = svdup_n_u32(border_value[0]); + } + + svfloat32_t xmaxf = svdup_n_f32(static_cast(src_width - 1)); + svfloat32_t ymaxf = svdup_n_f32(static_cast(src_height - 1)); + + const size_t kStep = VecTraits::num_lanes(); + + svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1)); + svfloat32_t T0 = svdup_n_f32(transform[0]); + svfloat32_t T3 = svdup_n_f32(transform[3]); + svfloat32_t T6 = svdup_n_f32(transform[6]); + svfloat32_t tx0, ty0, tw0; + + auto calc_coords = [&](svbool_t, size_t x) { + svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast(x)); + // Calculate half-transformed values from the first few pixel values, + // plus Tn*x, similarly to the one above + // Calculate inverse weight because division is expensive + svfloat32_t iw = + svdiv_f32_x(pg_all32, svdup_n_f32(1.F), svmla_x(pg_all32, tw0, vx, T6)); + svfloat32_t tx = svmla_x(pg_all32, tx0, vx, T0); + svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3); + + // Calculate coordinates into the source image + return svcreate2(svmul_f32_x(pg_all32, tx, iw), + svmul_f32_x(pg_all32, ty, iw)); + }; + + auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) { + svfloat32x2_t coords = calc_coords(pg32, x); + svfloat32_t xf = svget2(coords, 0); + svfloat32_t yf = svget2(coords, 1); + + svuint32_t xi, yi; + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + // Round to the nearest integer + xi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); + yi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); + } else { + // Round to the nearest integer, clamp it to within the dimensions of the + // source image (negative values are already saturated to 0) + xi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), + sv_xmax); + yi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), + sv_ymax); + } + return svcreate2(xi, yi); + }; + + auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) { + svbool_t in_range = + svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); + svuint32_t result = + load_xy(in_range, x, y, sv_src_stride, src_rows); + // Select between source pixels and border colour + return svsel_u32(in_range, result, sv_border); + }; + 
+  auto vector_path_nearest_4x = [&](size_t x, Columns dst) {
+    auto load_source = [&](svuint32x2_t coords) {
+      svuint32_t x = svget2(coords, 0);
+      svuint32_t y = svget2(coords, 1);
+      if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+        return get_pixels_or_border(pg_all32, x, y);
+      } else {
+        return load_xy(pg_all32, x, y, sv_src_stride,
+                       src_rows);
+      }
+    };
+    ScalarType *p_dst = &dst[static_cast(x)];
+    svuint32_t res32_0 =
+        load_source(calculate_nearest_coordinates(pg_all32, x));
+    x += kStep;
+    svuint32_t res32_1 =
+        load_source(calculate_nearest_coordinates(pg_all32, x));
+    svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
+                                    svreinterpret_u16_u32(res32_1));
+    x += kStep;
+    res32_0 = load_source(calculate_nearest_coordinates(pg_all32, x));
+    x += kStep;
+    res32_1 = load_source(calculate_nearest_coordinates(pg_all32, x));
+    svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
+                                    svreinterpret_u16_u32(res32_1));
+    svuint8_t result =
+        svuzp1_u8(svreinterpret_u8_u16(result0), svreinterpret_u8_u16(result1));
+    svst1(svptrue_b8(), p_dst, result);
+  };
+
+  auto vector_path_nearest_tail = [&](size_t x, size_t x_max,
+                                      Columns dst) {
+    size_t length = x_max - x;
+    svbool_t pg32 = svwhilelt_b32(0ULL, length);
+
+    svuint32x2_t coords = calculate_nearest_coordinates(pg32, x);
+    svuint32_t xi = svget2(coords, 0);
+    svuint32_t yi = svget2(coords, 1);
+
+    svuint32_t result;
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+      result = get_pixels_or_border(pg32, xi, yi);
+    } else {
+      result =
+          load_xy(pg32, xi, yi, sv_src_stride, src_rows);
+    }
+    svst1b_u32(pg32, &dst[static_cast(x)], result);
+  };
+
+  auto calculate_linear = [&](svbool_t pg, uint32_t x) {
+    svfloat32x2_t coords = calc_coords(pg, x);
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
+      return calculate_linear_replicated_border(
+          pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows);
+    } else {
+      static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
+      return calculate_linear_constant_border(
+          pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows);
+    }
+  };
+
+  auto process_row = [&](size_t y) {
+    float fy = static_cast(y);
+    // Calculate half-transformed values at the first pixel (numerators)
+    // tw = T6*x + T7*y + T8
+    // tx = (T0*x + T1*y + T2) / tw
+    // ty = (T3*x + T4*y + T5) / tw
+    tx0 = svdup_n_f32(fmaf(transform[1], fy, transform[2]));
+    ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5]));
+    tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8]));
+
+    Columns dst = dst_rows.as_columns();
+    LoopUnroll2 loop{dst_width, kStep};
+    if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
+      loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); });
+      loop.unroll_once(
+          [&](size_t x) { vector_path_nearest_tail(x, x + kStep, dst); });
+      loop.remaining([&](size_t x, size_t length) {
+        vector_path_nearest_tail(x, length, dst);
+      });
+    } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) {
+      loop.unroll_four_times([&](size_t x) {
+        ScalarType *p_dst = &dst[static_cast(x)];
+        svuint32_t res0 = calculate_linear(pg_all32, x);
+        x += kStep;
+        svuint32_t res1 = calculate_linear(pg_all32, x);
+        svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0),
+                                           svreinterpret_u16_u32(res1));
+        x += kStep;
+        res0 = calculate_linear(pg_all32, x);
+        x += kStep;
+        res1 = calculate_linear(pg_all32, x);
+        svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0),
+                                           svreinterpret_u16_u32(res1));
+        svst1_u8(svptrue_b8(), p_dst,
+                 
svuzp1_u8(svreinterpret_u8_u16(result16_0),
+                           svreinterpret_u8_u16(result16_1)));
+      });
+      loop.unroll_once([&](size_t x) {
+        ScalarType *p_dst = &dst[static_cast(x)];
+        svuint32_t result = calculate_linear(pg_all32, x);
+        svst1b_u32(pg_all32, p_dst, result);
+      });
+      loop.remaining([&](size_t x, size_t x_max) {
+        ScalarType *p_dst = &dst[static_cast(x)];
+        svbool_t pg32 = svwhilelt_b32(x, x_max);
+        svuint32_t result = calculate_linear(pg32, x);
+        svst1b_u32(pg32, p_dst, result);
+      });
+    } else {
+      static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST ||
+                        Inter == KLEIDICV_INTERPOLATION_LINEAR,
+                    "Unknown interpolation type!");
+    }
+  };
+
+  for (size_t y = y_begin; y < y_end; ++y) {
+    process_row(y);
+    ++dst_rows;
+  }
+}
+
 template 
-kleidicv_error_t warp_perspective_stripe(
+KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe(
     const T *src, size_t src_stride, size_t src_width, size_t src_height,
     T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
-    size_t y_begin, size_t y_end, const float transformation[9],
-    size_t channels, kleidicv_interpolation_type_t interpolation,
+    size_t y_begin, size_t y_end, const float transform[9], size_t channels,
+    kleidicv_interpolation_type_t interpolation,
     kleidicv_border_type_t border_type, const T *border_value) {
-  return warp_perspective_stripe_sc(
-      src, src_stride, src_width, src_height, dst, dst_stride, dst_width,
-      dst_height, y_begin, y_end, transformation, channels, interpolation,
-      border_type, border_value);
+  CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
+  CHECK_POINTERS(transform);
+  CHECK_IMAGE_SIZE(src_width, src_height);
+  CHECK_IMAGE_SIZE(dst_width, dst_height);
+  if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
+    return KLEIDICV_ERROR_NULL_POINTER;
+  }
+
+  // Calculating in float32_t will only be precise up to 24 bits, and
+  // multiplication can only be done with 32x32 bits
+  // Empty source image is not supported
+  if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) ||
+      dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) ||
+      src_stride >= (1ULL << 32) || src_width == 0 || src_height == 0) {
+    return KLEIDICV_ERROR_RANGE;
+  }
+
+  Rows src_rows{src, src_stride, channels};
+  Rows dst_rows{dst, dst_stride, channels};
+  Rectangle rect{dst_width, dst_height};
+
+  dst_rows += y_begin;
+
+  transform_operation(is_image_large(src_rows, src_height), interpolation,
+                      border_type, src_rows, src_width, src_height,
+                      border_value, dst_rows, dst_width, y_begin, y_end,
+                      transform);
+
+  return KLEIDICV_OK;
 }
 
 #define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(type) \
diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp
index 7a18a0bcf..d3bde250c 100644
--- a/test/api/test_remap.cpp
+++ b/test/api/test_remap.cpp
@@ -599,8 +599,8 @@ class RemapS16Point5 : public testing::Test {
     EXPECT_EQ_ARRAY2D(actual, expected);
   }
 
-  static ScalarType lerp2d(size_t cx, size_t cy, ScalarType a, ScalarType b,
-                           ScalarType c, ScalarType d) {
+  static ScalarType lerp_2d(size_t cx, size_t cy, ScalarType a, ScalarType b,
+                            ScalarType c, ScalarType d) {
     size_t inv_cx = FRAC_MAX - cx, inv_cy = FRAC_MAX - cy;
     ScalarType r =
         static_cast((inv_cx * inv_cy * a + cx * inv_cy * b +
                      inv_cx * cy * c +
@@ -635,8 +635,8 @@
           const int16_t *coords = mapxy.at(row, column * 2);
           int16_t x = coords[0], y = coords[1];
           *expected.at(row, column * src.channels() + ch) =
-              lerp2d(x_frac, y_frac, get_src(x, y)[ch], 
get_src(x + 1, y)[ch], - get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); + lerp_2d(x_frac, y_frac, get_src(x, y)[ch], get_src(x + 1, y)[ch], + get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); } } } @@ -1269,8 +1269,8 @@ TYPED_TEST(RemapF32, ZeroHeightImage) { const size_t src_stride = kW * sizeof(TypeParam); const size_t big_stride = (1UL << 32UL) - sizeof(TypeParam); const size_t dst_stride = kW * sizeof(TypeParam); - float mapx[kW] = {}; - float mapy[kW] = {}; + float mapx[kW] = {-0.2, 0.3, 1.4, 2.5}; + float mapy[kW] = {-1.8, -0.7, 0.6, 1.3}; const size_t mapx_stride = kW * sizeof(float); const size_t mapy_stride = kW * sizeof(float); @@ -1288,7 +1288,7 @@ TYPED_TEST(RemapF32, ZeroHeightImage) { border_type, border_value)); } const TypeParam border_value[1] = {0}; - EXPECT_EQ(KLEIDICV_OK, + EXPECT_EQ(KLEIDICV_ERROR_RANGE, remap_f32()( src, src_stride, kW, 0, dst, dst_stride, kW, 1, 1, mapx, mapx_stride, mapy, mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, diff --git a/test/api/test_warp_perspective.cpp b/test/api/test_warp_perspective.cpp index db5004de5..7efc85ab9 100644 --- a/test/api/test_warp_perspective.cpp +++ b/test/api/test_warp_perspective.cpp @@ -724,8 +724,8 @@ class WarpPerspectiveLinear : public testing::Test { INT_MIN, std::min(floor(fy), KLEIDICV_MAX_IMAGE_PIXELS))); ptrdiff_t ix1 = ix0 + 1; ptrdiff_t iy1 = iy0 + 1; - double xfrac = std::isfinite(fx) ? fx - floor(fx) : 0.0; - double yfrac = std::isfinite(fy) ? fy - floor(fy) : 0.0; + double xfrac = fx - floor(fx); + double yfrac = fy - floor(fy); for (size_t ch = 0; ch < src.channels(); ++ch) { double a = get_src(ix0, iy0)[ch]; double b = get_src(ix1, iy0)[ch]; @@ -844,15 +844,16 @@ TYPED_TEST(WarpPerspectiveLinear, RandomTransform) { float transform[9]; // Not entirely random, as very small and very big floats (in absolute value) // cause too big errors and they are far from being valid use cases anyway - test::PseudoRandomNumberGeneratorIntRange exponentGenerator(-7, 7); - test::PseudoRandomNumberGeneratorFloatRange mantissaGenerator(-1.0, + test::PseudoRandomNumberGeneratorIntRange exponentGenerator(-7, 7); + test::PseudoRandomNumberGeneratorIntRange signGenerator(0, 1); + test::PseudoRandomNumberGeneratorFloatRange mantissaGenerator(0.01, 1.0); for (size_t cc = 0; cc < 100; ++cc) { for (size_t i = 0; i < 9; ++i) { transform[i] = - mantissaGenerator.next().value_or(1.0) * - static_cast( - exp(static_cast(exponentGenerator.next().value_or(1.0)))); + mantissaGenerator.next().value_or(1.0F) * + (2.0F * static_cast(signGenerator.next().value_or(0)) - 1.0F) * + expf(static_cast(exponentGenerator.next().value_or(1))); } size_t src_w = 3 * test::Options::vector_lanes() - 1; @@ -875,13 +876,26 @@ TYPED_TEST(WarpPerspectiveLinear, DivisionByZero) { }; // clang-format on - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 4; - size_t dst_w = src_w; - size_t dst_h = src_h; + size_t kW = 3 * test::Options::vector_lanes() - 1; + size_t kH = 2; + + test::Array2D source{kW, kH, 1, 1}; + test::Array2D dst{kW, kH, 1, 1}; + + for (int64_t y = 0; y < static_cast(source.height()); ++y) { + for (int64_t x = 0; x < static_cast(source.width()); ++x) { + const int64_t kMaxVal = std::numeric_limits::max() / 2; + *source.at(y, x) = + kMaxVal / 4 + abs((x + y) % (2 * kMaxVal + 1) - kMaxVal); + } + } + for (auto [border_type, border_value] : get_borders()) { - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, - border_type, border_value, 3); + EXPECT_EQ(KLEIDICV_OK, + kleidicv_warp_perspective_u8( + 
source.data(), source.stride(), kW, kH, dst.data(), + dst.stride(), kW, kH, transform_div_by_zero, 1, + KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value)); } } -- GitLab
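A note on the two behaviour changes these tests pin down. First, a zero-width
or zero-height source is now rejected with KLEIDICV_ERROR_RANGE instead of
being accepted. A minimal caller-side sketch, assuming a kleidicv_remap_f32_u8
entry point with the same parameter order as the remap_f32 test wrapper above
(the exact symbol name is an assumption, not taken from this patch):

#include <cstdint>

#include "kleidicv/kleidicv.h"

int main() {
  const uint8_t src[4] = {};  // backing storage exists, but src_width is 0
  uint8_t dst[4] = {};
  const float mapx[4] = {-0.5F, 0.25F, 1.0F, 2.5F};
  const float mapy[4] = {0.0F, 0.0F, 0.0F, 0.0F};
  const uint8_t border_value[1] = {0};

  // Previously a zero-size source was accepted; it is now rejected up front.
  kleidicv_error_t err = kleidicv_remap_f32_u8(
      src, sizeof(src), /*src_width=*/0, /*src_height=*/1, dst, sizeof(dst),
      /*dst_width=*/4, /*dst_height=*/1, /*channels=*/1, mapx, sizeof(mapx),
      mapy, sizeof(mapy), KLEIDICV_INTERPOLATION_LINEAR,
      KLEIDICV_BORDER_TYPE_CONSTANT, border_value);
  return err == KLEIDICV_ERROR_RANGE ? 0 : 1;
}

Second, division by zero inside the perspective transform is no longer forced
to 0: the resulting NaN coordinates fall through the normal border handling,
which is why the DivisionByZero test above now simply expects KLEIDICV_OK.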