From 3e45dae766b76c96827f4768069aa8fa0dd47c8c Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Thu, 27 Feb 2025 17:44:51 +0000 Subject: [PATCH] Implement Remap Float32 for 2 channels --- CHANGELOG.md | 2 +- adapters/opencv/kleidicv_hal.cpp | 13 +- conformity/opencv/test_remap.cpp | 25 +- doc/functionality.md | 11 +- doc/opencv.md | 5 +- kleidicv/include/kleidicv/kleidicv.h | 2 +- kleidicv/include/kleidicv/transform/remap.h | 22 +- kleidicv/include/kleidicv/types.h | 5 + kleidicv/src/transform/remap_f32_neon.cpp | 206 +++++---- kleidicv/src/transform/remap_f32_sve2.cpp | 367 ++++++++++------ kleidicv/src/transform/transform_common.h | 18 + kleidicv/src/transform/transform_neon.h | 414 ++++++++++++++---- kleidicv/src/transform/transform_sve2.h | 257 ++++++++--- .../src/transform/warp_perspective_neon.cpp | 21 +- .../src/transform/warp_perspective_sve2.cpp | 18 +- kleidicv_thread/src/kleidicv_thread.cpp | 8 +- scripts/benchmark/benchmarks.txt | 16 +- test/api/test_remap.cpp | 168 +++---- test/api/test_thread.cpp | 23 +- 19 files changed, 1096 insertions(+), 505 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d3213a9b..ffaf5232d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ This changelog aims to follow the guiding principles of - Fixed-point coordinates with linear interpolation - Floating-point coordinates with nearest neighbour and linear interpolation - Replicated and constant borders - - 1-channel only + - 1 channel source image for integer and fixed-points coordinates, 1 and 2 channels for floating-point coordinates - u8 and u16 images - WarpPerspective implementation - Nearest and Linear interpolation method, for 1-channel u8 input. 
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index e73c350fa..8a897d1e4 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -1393,24 +1393,25 @@ int remap_f32(int src_type, const uchar *src_data, size_t src_step, return CV_HAL_ERROR_NOT_IMPLEMENTED; } + size_t channels = (src_type >> CV_CN_SHIFT) + 1; auto mt = get_multithreading(); - if (src_type == CV_8UC1) { + if (CV_MAT_DEPTH(src_type) == CV_8U) { auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_f32_u8( src_data, src_step, static_cast(src_width), static_cast(src_height), dst_data, dst_step, - static_cast(dst_width), static_cast(dst_height), 1, - mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, + static_cast(dst_width), static_cast(dst_height), + channels, mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, kleidicv_border_type, border_value.data(), mt)); - } else if (src_type == CV_16UC1) { + } else if (CV_MAT_DEPTH(src_type) == CV_16U) { auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_f32_u16( reinterpret_cast(src_data), src_step, static_cast(src_width), static_cast(src_height), reinterpret_cast(dst_data), dst_step, - static_cast(dst_width), static_cast(dst_height), 1, - mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, + static_cast(dst_width), static_cast(dst_height), + channels, mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, kleidicv_border_type, border_value.data(), mt)); } diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index face699ec..f80e1ab3d 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -254,14 +254,23 @@ std::vector& remap_tests_get() { TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5),
(exec_remap_s16point5)), - TEST("RemapF32 uint8 Replicate Linear", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint16 Replicate Linear", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint8 Constant Linear", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint16 Constant Linear", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint8 Replicate Nearest", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint16 Replicate Nearest", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint8 Constant Nearest", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint16 Constant Nearest", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Replicate Linear 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate Linear 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Constant Linear 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant Linear 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Replicate Nearest 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate Nearest 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Constant Nearest 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant Nearest 1ch", (test_remap_f32), (exec_remap_f32)), + + TEST("RemapF32 uint8 Replicate Linear 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate Linear 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Constant Linear 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant Linear 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Replicate Nearest 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate Nearest 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Constant Nearest 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant Nearest 2ch", (test_remap_f32), (exec_remap_f32)), 
}; // clang-format on return tests; diff --git a/doc/functionality.md b/doc/functionality.md index 83c8fdc71..3d178946f 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -93,11 +93,12 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | 8x8 | | x | # Remap -| | u8 | u16 | -|--------------------------------------------|-----|-----| -| Remap int16 coordinates | x | x | -| Remap int16+uint16 fixed-point coordinates | x | x | -| Remap float32 coordinates | x | x | +| | 1ch u8 | 1ch u16 | 2ch u8 | 2ch u16 | +|---------------------------------------------------|---------|---------|--------|---------| +| Remap int16 coordinates | x | x | | | +| Remap int16+uint16 fixed-point coordinates | x | x | | | +| Remap float32 coordinates - nearest interpolation | x | x | x | x | +| Remap float32 coordinates - linear interpolation | x | x | x | x | # WarpPerspective | | u8 | diff --git a/doc/opencv.md b/doc/opencv.md index a972dbdac..dfc154e09 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -208,9 +208,10 @@ Geometrically transforms the `src` image by taking the pixels specified by the c Notes on parameters: * `src.step` - must be less than 65536 * element size. * `src.width`, `src_height` - must not be greater than 32768. -* `src.type()` - supports `CV_8UC1` and `CV_16UC1`. +* `src.type()` - supports `CV_8UC1` and `CV_16UC1`. With `CV_32FC1` map config, it supports `CV_8UC2` and `CV_16UC2` as well. * `dst.cols` - must be at least 4 (32FC1-type maps) or 8 (16SC2-type maps) -* `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`. \ +* `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`. 
+ Supported map configurations: * `map1.type()` is `CV_16SC2` and `map2` is empty: * > ⚠️ **Acceleration will not work unless OpenCV is built from source patched with `opencv-4.11.patch`** diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 2bed6acfd..6a5ff2658 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1861,7 +1861,7 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src, /// least 4. /// @param dst_height Number of rows in the destination data. /// @param channels Number of channels in the (source and destination) -/// data. Must be 1. +/// data. Can be 1 or 2. /// @param mapx Pointer to the x coordinates' data. Must be non-null. /// @param mapx_stride Distance in bytes from the start of one row to the /// start of the next row for `mapx`. Must be a multiple diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index 15b1654b6..ea72f5666 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -53,21 +53,19 @@ inline bool remap_s16point5_is_implemented( template inline bool remap_f32_is_implemented( size_t src_stride, size_t src_width, size_t src_height, size_t dst_width, - kleidicv_border_type_t border_type, size_t channels, + size_t dst_height, kleidicv_border_type_t border_type, size_t channels, kleidicv_interpolation_type_t interpolation) KLEIDICV_STREAMING_COMPATIBLE { if constexpr (std::is_same::value || std::is_same::value) { - return ( - src_stride <= std::numeric_limits::max() && dst_width >= 4 && - src_width <= - static_cast(std::numeric_limits::max()) + 1 && - src_height <= - static_cast(std::numeric_limits::max()) + 1 && - (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || - border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && - channels == 1 && - (interpolation == KLEIDICV_INTERPOLATION_LINEAR || - interpolation == 
KLEIDICV_INTERPOLATION_NEAREST)); + return (src_stride <= std::numeric_limits::max() && + dst_width >= 4 && src_width < (1ULL << 24) && + src_height < (1ULL << 24) && dst_width < (1ULL << 24) && + dst_height < (1ULL << 24) && src_width > 0 && src_height > 0 && + (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || + border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && + (channels == 1 || channels == 2) && + (interpolation == KLEIDICV_INTERPOLATION_LINEAR || + interpolation == KLEIDICV_INTERPOLATION_NEAREST)); } else { return false; } diff --git a/kleidicv/include/kleidicv/types.h b/kleidicv/include/kleidicv/types.h index 831507833..e1588cc11 100644 --- a/kleidicv/include/kleidicv/types.h +++ b/kleidicv/include/kleidicv/types.h @@ -146,6 +146,11 @@ class Columns final { channels()}; } + // Returns a pointer to a given column. + [[nodiscard]] T *ptr_at(ptrdiff_t column) KLEIDICV_STREAMING_COMPATIBLE { + return ptr_ + column * static_cast(channels()); + } + // Returns the number of channels in a row. 
size_t channels() const KLEIDICV_STREAMING_COMPATIBLE { return channels_; } diff --git a/kleidicv/src/transform/remap_f32_neon.cpp b/kleidicv/src/transform/remap_f32_neon.cpp index 18f517c14..9996840fd 100644 --- a/kleidicv/src/transform/remap_f32_neon.cpp +++ b/kleidicv/src/transform/remap_f32_neon.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include "kleidicv/ctypes.h" @@ -13,46 +11,33 @@ namespace kleidicv::neon { -template -void remap_f32_nearest_replicate( - uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, - Rows src_rows, Columns dst, size_t dst_width, - Columns mapx, Columns mapy, const size_t kStep) { - LoopUnroll2 loop{dst_width, kStep}; - loop.unroll_once([&](size_t x) { - transform_pixels_replicate( - vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x]), v_xmax, v_ymax, v_src_stride, - src_rows, dst.at(x)); - }); -} - -template -void remap_f32_nearest_constant(uint32x4_t v_xmax, uint32x4_t v_ymax, - uint32x4_t v_src_stride, - Rows src_rows, - Columns dst, size_t dst_width, - Columns mapx, - Columns mapy, const size_t kStep, - ScalarType border_value) { +template +void remap_f32_nearest(uint32x4_t v_xmax, uint32x4_t v_ymax, + uint32x4_t v_src_stride, Rows src_rows, + Columns dst, size_t dst_width, + Columns mapx, Columns mapy, + const size_t kStep, const ScalarType *border_values) { LoopUnroll2 loop{dst_width, kStep}; loop.unroll_once([&](size_t x) { - transform_pixels_constant( + transform_pixels( vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x]), v_xmax, v_ymax, v_src_stride, - src_rows, dst.at(x), border_value); + src_rows, dst.at(x), border_values); }); } -template +template void remap_f32_linear(uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, Rows src_rows, Columns dst, size_t dst_width, Columns mapx, Columns mapy, - const size_t kStep, ScalarType border_value) { + const size_t kStep, const ScalarType *border_values) { auto load_xy = [&](size_t x) { return FloatVectorPair{vld1q_f32(&mapx[x]), 
vld1q_f32(&mapy[x])}; }; - auto vector_path = [&](size_t x) { + auto vector_path_1ch = [&](size_t x) { float32x4_t a, b, c, d, xfrac, yfrac; if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { load_quad_pixels_replicate( @@ -61,62 +46,113 @@ void remap_f32_linear(uint32x4_t v_xmax, uint32x4_t v_ymax, } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); load_quad_pixels_constant( - load_xy(x), v_xmax, v_ymax, v_src_stride, border_value, src_rows, + load_xy(x), v_xmax, v_ymax, v_src_stride, border_values, src_rows, xfrac, yfrac, a, b, c, d); } return lerp_2d(xfrac, yfrac, a, b, c, d); }; + auto vector_path_2ch = [&](size_t x) { + float32x4x2_t a, b, c, d; + float32x4_t xfrac, yfrac; + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + load_quad_pixels_replicate_2ch( + load_xy(x), v_xmax, v_ymax, v_src_stride, src_rows, xfrac, yfrac, a, + b, c, d); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + load_quad_pixels_constant_2ch( + load_xy(x), v_xmax, v_ymax, v_src_stride, border_values, src_rows, + xfrac, yfrac, a, b, c, d); + } + float32x4_t xfrac_low = vzip1q_f32(xfrac, xfrac); + float32x4_t yfrac_low = vzip1q_f32(yfrac, yfrac); + float32x4_t xfrac_high = vzip2q_f32(xfrac, xfrac); + float32x4_t yfrac_high = vzip2q_f32(yfrac, yfrac); + + uint32x4_t result_low = + lerp_2d(xfrac_low, yfrac_low, a.val[0], b.val[0], c.val[0], d.val[0]); + uint32x4_t result_high = + lerp_2d(xfrac_high, yfrac_high, a.val[1], b.val[1], c.val[1], d.val[1]); + return vuzp1q_u16(result_low, result_high); + }; + LoopUnroll2 loop{dst_width, kStep}; - if constexpr (std::is_same::value) { - loop.unroll_four_times([&](size_t x) { - ScalarType *p_dst = &dst[x]; - uint32x4_t res0 = vector_path(x); - x += kStep; - uint32x4_t res1 = vector_path(x); - uint16x8_t result16_0 = vuzp1q_u16(res0, res1); - - x += kStep; - res0 = vector_path(x); - x += kStep; - res1 = vector_path(x); - uint16x8_t result16_1 = vuzp1q_u16(res0, res1); - - vst1q_u8(p_dst, 
vuzp1q_u8(result16_0, result16_1)); - }); - loop.unroll_once([&](size_t x) { - uint32x4_t result = vector_path(x); - dst[x] = vgetq_lane_u32(result, 0); - dst[x + 1] = vgetq_lane_u32(result, 1); - dst[x + 2] = vgetq_lane_u32(result, 2); - dst[x + 3] = vgetq_lane_u32(result, 3); - }); - } else if constexpr (std::is_same::value) { - loop.unroll_twice([&](size_t x) { - ScalarType *p_dst = &dst[x]; - uint32x4_t res0 = vector_path(x); - x += kStep; - uint32x4_t res1 = vector_path(x); - vst1q_u16(p_dst, vuzp1q_u16(res0, res1)); - }); - loop.unroll_once([&](size_t x) { - uint32x4_t result = vector_path(x); - uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[x], result16); - }); + if constexpr (Channels == 1) { + if constexpr (std::is_same::value) { + loop.unroll_four_times([&](size_t x) { + ScalarType *p_dst = &dst[x]; + uint32x4_t res0 = vector_path_1ch(x); + x += kStep; + uint32x4_t res1 = vector_path_1ch(x); + uint16x8_t result16_0 = vuzp1q_u16(res0, res1); + + x += kStep; + res0 = vector_path_1ch(x); + x += kStep; + res1 = vector_path_1ch(x); + uint16x8_t result16_1 = vuzp1q_u16(res0, res1); + + vst1q_u8(p_dst, vuzp1q_u8(result16_0, result16_1)); + }); + loop.unroll_once([&](size_t x) { + uint8x16_t result = vreinterpretq_u8_u32(vector_path_1ch(x)); + dst[x] = vgetq_lane_u8(result, 0); + dst[x + 1] = vgetq_lane_u8(result, 4); + dst[x + 2] = vgetq_lane_u8(result, 8); + dst[x + 3] = vgetq_lane_u8(result, 12); + }); + } + if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType *p_dst = dst.ptr_at(x); + uint32x4_t res0 = vector_path_1ch(x); + x += kStep; + uint32x4_t res1 = vector_path_1ch(x); + vst1q_u16(p_dst, vuzp1q_u16(res0, res1)); + }); + loop.unroll_once([&](size_t x) { + uint32x4_t result = vector_path_1ch(x); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(dst.ptr_at(x), result16); + }); + } + } + if constexpr (Channels == 2) { + if constexpr (std::is_same::value) { + 
loop.unroll_twice([&](size_t x) { + ScalarType *p_dst = dst.ptr_at(x); + uint16x8_t result16_0 = vector_path_2ch(x); + x += kStep; + uint16x8_t result16_1 = vector_path_2ch(x); + vst1q_u8(p_dst, vuzp1q_u8(vreinterpretq_u8_u16(result16_0), + vreinterpretq_u8_u16(result16_1))); + }); + loop.unroll_once([&](size_t x) { + uint16x8_t result = vector_path_2ch(x); + vst1_u8(dst.ptr_at(x), vmovn_u16(result)); + }); + } + if constexpr (std::is_same::value) { + loop.unroll_once([&](size_t x) { + uint16x8_t result = vector_path_2ch(x); + vst1q_u16(dst.ptr_at(x), result); + }); + } } } template + kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border, + size_t Channels> void transform_operation(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value, + size_t src_height, const ScalarType *border_values, Rows dst_rows, size_t dst_width, size_t y_begin, size_t y_end, Rows mapx_rows, Rows mapy_rows) { - uint32x4_t v_src_stride = vdupq_n_u32( + uint32x4_t v_src_element_stride = vdupq_n_u32( static_cast(src_rows.stride() / sizeof(ScalarType))); uint32x4_t v_xmax = vdupq_n_u32(static_cast(src_width - 1)); uint32x4_t v_ymax = vdupq_n_u32(static_cast(src_height - 1)); @@ -124,23 +160,16 @@ void transform_operation(Rows src_rows, size_t src_width, for (size_t y = y_begin; y < y_end; ++y) { if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - remap_f32_nearest_replicate( - v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), - dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep); - } else { - static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - remap_f32_nearest_constant( - v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), - dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep, - border_value[0]); - } + remap_f32_nearest( + v_xmax, v_ymax, v_src_element_stride, src_rows, dst_rows.as_columns(), + dst_width, mapx_rows.as_columns(), 
mapy_rows.as_columns(), kStep, + border_values); } else { static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); - remap_f32_linear( - v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), + remap_f32_linear( + v_xmax, v_ymax, v_src_element_stride, src_rows, dst_rows.as_columns(), dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep, - Border == KLEIDICV_BORDER_TYPE_CONSTANT ? border_value[0] : 0); + border_values); } ++mapx_rows; ++mapy_rows; @@ -170,18 +199,11 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, } if (!remap_f32_is_implemented(src_stride, src_width, src_height, dst_width, - border_type, channels, interpolation)) { + dst_height, border_type, channels, + interpolation)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - // Calculating in float32_t will only be precise until 24 bits - if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || - // Empty source image is not supported - src_width == 0 || src_height == 0) { - return KLEIDICV_ERROR_RANGE; - } - Rows src_rows{src, src_stride, channels}; Rows mapx_rows{mapx, mapx_stride, 1}; Rows mapy_rows{mapy, mapy_stride, 1}; @@ -189,7 +211,7 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, Rectangle rect{dst_width, dst_height}; transform_operation(is_image_large(src_rows, src_height), interpolation, - border_type, src_rows, src_width, src_height, + border_type, channels, src_rows, src_width, src_height, border_value, dst_rows, dst_width, 0, dst_height, mapx_rows, mapy_rows); diff --git a/kleidicv/src/transform/remap_f32_sve2.cpp b/kleidicv/src/transform/remap_f32_sve2.cpp index e4ad3e724..ce05c2170 100644 --- a/kleidicv/src/transform/remap_f32_sve2.cpp +++ b/kleidicv/src/transform/remap_f32_sve2.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include #include @@ -15,33 +13,44 @@ namespace kleidicv::sve2 { -template +template 
void remap_f32_nearest(svuint32_t sv_xmax, svuint32_t sv_ymax, svuint32_t sv_src_stride, Rows src_rows, svuint32_t sv_border, Columns dst, size_t kStep, size_t dst_width, - Rows mapx_rows, - Rows mapy_rows) { + Rows mapx_rows, Rows mapy_rows, + [[maybe_unused]] svuint8_t load_table_2ch) { svbool_t pg_all32 = svptrue_b32(); + svbool_t pg_all16 = svptrue_b16(); auto load_coords = [&](svbool_t pg, size_t xs) { auto x = static_cast(xs); return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), svld1_f32(pg, &mapy_rows.as_columns()[x])); }; + auto load_source = [&](svbool_t pg, svuint32_t x, svuint32_t y) { + if constexpr (Channels == 1) { + return load_xy(pg, x, y, sv_src_stride, src_rows); + } + if constexpr (Channels == 2) { + return load_xy_2ch(pg, x, y, sv_src_stride, src_rows, + load_table_2ch); + } + }; + auto get_pixels = [&](svbool_t pg, svuint32x2_t coords) { svuint32_t x = svget2(coords, 0); svuint32_t y = svget2(coords, 1); if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { svbool_t in_range = svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); - svuint32_t result = - load_xy(in_range, x, y, sv_src_stride, src_rows); + svuint32_t result = load_source(in_range, x, y); // Select between source pixels and border colour return svsel_u32(in_range, result, sv_border); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_REPLICATE); - return load_xy(pg, x, y, sv_src_stride, src_rows); + return load_source(pg, x, y); } }; @@ -76,77 +85,131 @@ void remap_f32_nearest(svuint32_t sv_xmax, svuint32_t sv_ymax, LoopUnroll2 loop{dst_width, kStep}; - if constexpr (std::is_same::value) { - auto vector_path_generic = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - svuint32_t result = - get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); - svst1b_u32(pg32, &dst[static_cast(x)], result); - }; - - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = 
&dst[static_cast(x)]; - svuint32_t res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - x += kStep; - res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0), - svreinterpret_u8_u16(result1)); - svst1(svptrue_b8(), p_dst, result); - }); - loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); - loop.remaining( - [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); + if constexpr (Channels == 1) { + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svst1b_u32(pg32, &dst[static_cast(x)], result); + }; + + loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + x += kStep; + res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + svuint8_t result = 
svuzp1_u8(svreinterpret_u8_u16(result0), + svreinterpret_u8_u16(result1)); + svst1(svptrue_b8(), p_dst, result); + }); + loop.unroll_once( + [&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining([&](size_t x, size_t length) { + vector_path_generic(x, length, dst); + }); + } + + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svst1h_u32(pg32, &dst[static_cast(x)], result); + }; + + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + svst1(svptrue_b16(), p_dst, result); + }); + loop.unroll_once( + [&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining([&](size_t x, size_t length) { + vector_path_generic(x, length, dst); + }); + } } - if constexpr (std::is_same::value) { - auto vector_path_generic = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - svuint32_t result = - get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); - svst1h_u32(pg32, &dst[static_cast(x)], result); - }; - - loop.unroll_twice([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svst1(svptrue_b16(), p_dst, result); - }); - 
loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); - loop.remaining( - [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); + if constexpr (Channels == 2) { + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svbool_t pg16 = svwhilelt_b16(0ULL, 2 * length); + svst1b_u16(pg16, dst.ptr_at(static_cast(x)), + svreinterpret_u16_u32(result)); + }; + + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = dst.ptr_at(static_cast(x)); + svuint32_t result0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t result1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint8_t result = svuzp1_u8(svreinterpret_u8_u32(result0), + svreinterpret_u8_u32(result1)); + svst1(svptrue_b8(), p_dst, result); + }); + loop.unroll_once( + [&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining([&](size_t x, size_t length) { + vector_path_generic(x, length, dst); + }); + } + + if constexpr (std::is_same::value) { + loop.unroll_once([&](size_t x) { + svuint16_t result = svreinterpret_u16_u32( + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x))); + svst1_u16(pg_all16, dst.ptr_at(static_cast(x)), result); + }); + loop.remaining([&](size_t x, size_t x_max) { + svbool_t pg32 = svwhilelt_b32(x, x_max); + svuint16_t result = svreinterpret_u16_u32( + get_pixels(pg32, calculate_nearest_coordinates(pg32, x))); + svbool_t pg16 = svwhilelt_b16(2 * x, 2 * x_max); + svst1_u16(pg16, dst.ptr_at(static_cast(x)), result); + }); + } } } -template +template void remap_f32_linear(svuint32_t sv_xmax, svuint32_t sv_ymax, svfloat32_t sv_xmaxf, svfloat32_t sv_ymaxf, svuint32_t sv_src_stride, Rows src_rows, svuint32_t sv_border, Columns dst, size_t kStep, 
size_t dst_width, - Rows mapx_rows, - Rows mapy_rows) { + Rows mapx_rows, Rows mapy_rows, + svuint8_t load_table_2ch) { auto load_coords = [&](svbool_t pg, size_t xs) { auto x = static_cast(xs); return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), @@ -156,13 +219,15 @@ void remap_f32_linear(svuint32_t sv_xmax, svuint32_t sv_ymax, auto calculate_linear = [&](svbool_t pg, uint32_t x) { if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { svfloat32x2_t coords = load_coords(pg, x); - return calculate_linear_replicated_border( - pg, coords, sv_xmaxf, sv_ymaxf, sv_src_stride, src_rows); + return calculate_linear_replicated_border( + pg, coords, sv_xmaxf, sv_ymaxf, sv_src_stride, src_rows, + load_table_2ch); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); svfloat32x2_t coords = load_coords(pg, x); - return calculate_linear_constant_border( - pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); + return calculate_linear_constant_border( + pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows, + load_table_2ch); } }; @@ -177,48 +242,94 @@ void remap_f32_linear(svuint32_t sv_xmax, svuint32_t sv_ymax, svbool_t pg_all32 = svptrue_b32(); LoopUnroll2 loop{dst_width, kStep}; - if constexpr (std::is_same::value) { - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16_0 = - svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); - x += kStep; - res0 = calculate_linear(pg_all32, x); - x += kStep; - res1 = calculate_linear(pg_all32, x); - svuint16_t result16_1 = - svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); - svst1_u8(svptrue_b8(), p_dst, - svuzp1_u8(svreinterpret_u8_u16(result16_0), - svreinterpret_u8_u16(result16_1))); + if constexpr (Channels == 1) { + if constexpr (std::is_same::value) { + 
loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + x += kStep; + res0 = calculate_linear(pg_all32, x); + x += kStep; + res1 = calculate_linear(pg_all32, x); + svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + svst1_u8(svptrue_b8(), p_dst, + svuzp1_u8(svreinterpret_u8_u16(result16_0), + svreinterpret_u8_u16(result16_1))); + }); + } + if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + svst1_u16(svptrue_b16(), p_dst, result16); + }); + } + loop.unroll_once([&](size_t x) { + svuint32_t result = calculate_linear(pg_all32, x); + store_vector(pg_all32, &dst[static_cast(x)], result); }); - } else if constexpr (std::is_same::value) { - loop.unroll_twice([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16 = - svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); - svst1_u16(svptrue_b16(), p_dst, result16); + loop.remaining([&](size_t x, size_t x_max) { + svbool_t pg32 = svwhilelt_b32(x, x_max); + svuint32_t result = calculate_linear(pg32, x); + store_vector(pg32, &dst[static_cast(x)], result); }); } - loop.unroll_once([&](size_t x) { - svuint32_t result = calculate_linear(pg_all32, x); - store_vector(pg_all32, &dst[static_cast(x)], result); - }); - loop.remaining([&](size_t x, size_t x_max) { - svbool_t pg32 = svwhilelt_b32(x, x_max); - svuint32_t 
result = calculate_linear(pg32, x); - store_vector(pg32, &dst[static_cast(x)], result); - }); + + if constexpr (Channels == 2) { + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = calculate_linear(pg32, x); + svbool_t pg16 = svwhilelt_b16(0ULL, 2 * length); + svst1b_u16(pg16, dst.ptr_at(static_cast(x)), + svreinterpret_u16_u32(result)); + }; + + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = dst.ptr_at(static_cast(x)); + svuint32_t result0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t result1 = calculate_linear(pg_all32, x); + svuint8_t result = svuzp1_u8(svreinterpret_u8_u32(result0), + svreinterpret_u8_u32(result1)); + svst1(svptrue_b8(), p_dst, result); + }); + loop.unroll_once( + [&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining([&](size_t x, size_t length) { + vector_path_generic(x, length, dst); + }); + } + if constexpr (std::is_same::value) { + loop.unroll_once([&](size_t x) { + svuint16_t result = + svreinterpret_u16_u32(calculate_linear(pg_all32, x)); + svst1_u16(svptrue_b16(), dst.ptr_at(static_cast(x)), result); + }); + loop.remaining([&](size_t x, size_t x_max) { + svbool_t pg32 = svwhilelt_b32(x, x_max); + svuint16_t result = svreinterpret_u16_u32(calculate_linear(pg32, x)); + svbool_t pg16 = svwhilelt_b16(2 * x, 2 * x_max); + svst1_u16(pg16, dst.ptr_at(static_cast(x)), result); + }); + } + } } template + kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border, + size_t Channels> void transform_operation(Rows src_rows, size_t src_width, size_t src_height, const ScalarType* border_value, Rows dst_rows, size_t dst_width, @@ -231,7 +342,14 @@ void transform_operation(Rows src_rows, size_t src_width, svuint32_t sv_border = svdup_n_u32(0); if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - sv_border = svdup_n_u32(border_value[0]); + 
if constexpr (Channels == 1) { + sv_border = svdup_n_u32(border_value[0]); + } + if constexpr (Channels == 2) { + uint32_t v = static_cast(border_value[0]) | + (static_cast(border_value[1]) << 16); + sv_border = svdup_n_u32(v); + } } svfloat32_t sv_xmaxf = svdup_n_f32(static_cast(src_width - 1)); @@ -239,17 +357,25 @@ void transform_operation(Rows src_rows, size_t src_width, const size_t kStep = VecTraits::num_lanes(); + // Rearrange input for 8bit 2channel: + // Gather Load 16bits, 2x 8bits for 2 channels: + // after 32-bit gather load: ..DC..BA + // goal is to have 16-bit elements: .D.C.B.A + svuint8_t load_table_2ch = + svreinterpret_u8_u32(svindex_u32(0x03010200U, 0x04040404)); + for (size_t y = y_begin; y < y_end; ++y) { Columns dst = dst_rows.as_columns(); if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - remap_f32_nearest( + remap_f32_nearest( sv_xmax, sv_ymax, sv_src_stride, src_rows, sv_border, dst, kStep, - dst_width, mapx_rows, mapy_rows); + dst_width, mapx_rows, mapy_rows, load_table_2ch); } else { static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); - remap_f32_linear( + remap_f32_linear( sv_xmax, sv_ymax, sv_xmaxf, sv_ymaxf, sv_src_stride, src_rows, - sv_border, dst, kStep, dst_width, mapx_rows, mapy_rows); + sv_border, dst, kStep, dst_width, mapx_rows, mapy_rows, + load_table_2ch); } ++mapx_rows; ++mapy_rows; @@ -279,18 +405,11 @@ kleidicv_error_t remap_f32(const T* src, size_t src_stride, size_t src_width, } if (!remap_f32_is_implemented(src_stride, src_width, src_height, dst_width, - border_type, channels, interpolation)) { + dst_height, border_type, channels, + interpolation)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - // Calculating in float32_t will only be precise until 24 bits - if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || - // Empty source image is not supported - src_width == 0 || src_height == 0) { - return KLEIDICV_ERROR_RANGE; - } - Rows 
src_rows{src, src_stride, channels}; Rows mapx_rows{mapx, mapx_stride, 1}; Rows mapy_rows{mapy, mapy_stride, 1}; @@ -298,7 +417,7 @@ kleidicv_error_t remap_f32(const T* src, size_t src_stride, size_t src_width, Rectangle rect{dst_width, dst_height}; transform_operation(is_image_large(src_rows, src_height), interpolation, - border_type, src_rows, src_width, src_height, + border_type, channels, src_rows, src_width, src_height, border_value, dst_rows, dst_width, 0, dst_height, mapx_rows, mapy_rows); diff --git a/kleidicv/src/transform/transform_common.h b/kleidicv/src/transform/transform_common.h index eaf005f66..da5bbeeef 100644 --- a/kleidicv/src/transform/transform_common.h +++ b/kleidicv/src/transform/transform_common.h @@ -13,6 +13,24 @@ bool is_image_large(const Rows &rows, size_t height) { return rows.stride() * height >= 1ULL << 32; } +// Convert channels to a template argument. +template +void transform_operation(size_t channels, Args &&...args) { + switch (channels) { + case 1: + transform_operation( + std::forward(args)...); + break; + case 2: + transform_operation( + std::forward(args)...); + default: + return; + } +} + // Convert border_type to a template argument. 
template diff --git a/kleidicv/src/transform/transform_neon.h b/kleidicv/src/transform/transform_neon.h index afcc72da8..ab0f5b2ca 100644 --- a/kleidicv/src/transform/transform_neon.h +++ b/kleidicv/src/transform/transform_neon.h @@ -2,8 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - +#include "kleidicv/ctypes.h" #include "kleidicv/neon.h" #include "kleidicv/types.h" #include "transform_common.h" @@ -18,33 +17,70 @@ template float32x4_t inline load_xy(uint32x4_t x, uint32x4_t y, uint32x4_t v_src_stride, Rows& src_rows) { if constexpr (IsLarge) { - uint64x2_t offset_low = + uint64x2_t indices_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), vget_low_u32(v_src_stride)); - uint64x2_t offset_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), - vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); uint64_t acc = - static_cast(src_rows[vgetq_lane_u64(offset_low, 0)]) | - (static_cast(src_rows[vgetq_lane_u64(offset_low, 1)]) << 32); + static_cast(src_rows[vgetq_lane_u64(indices_low, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(indices_low, 1)]) << 32); uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = - static_cast(src_rows[vgetq_lane_u64(offset_high, 0)]) | - (static_cast(src_rows[vgetq_lane_u64(offset_high, 1)]) << 32); + acc = static_cast(src_rows[vgetq_lane_u64(indices_high, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(indices_high, 1)]) + << 32); rawsrc = vsetq_lane_u64(acc, rawsrc, 1); return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); } else { - uint32x4_t offset = vmlaq_u32(x, y, v_src_stride); + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); uint64_t acc = - static_cast(src_rows[vgetq_lane_u32(offset, 0)]) | - (static_cast(src_rows[vgetq_lane_u32(offset, 1)]) << 32); + static_cast(src_rows[vgetq_lane_u32(indices, 0)]) | + (static_cast(src_rows[vgetq_lane_u32(indices, 1)]) << 32); uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = 
static_cast(src_rows[vgetq_lane_u32(offset, 2)]) | - (static_cast(src_rows[vgetq_lane_u32(offset, 3)]) << 32); + acc = static_cast(src_rows[vgetq_lane_u32(indices, 2)]) | + (static_cast(src_rows[vgetq_lane_u32(indices, 3)]) << 32); rawsrc = vsetq_lane_u64(acc, rawsrc, 1); return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); } } +template +float32x4x2_t inline load_xy_2ch(uint32x4_t x, uint32x4_t y, + uint32x4_t v_src_stride, + Rows& src_rows) { + const size_t kBytes = 2 * sizeof(ScalarType); + ScalarType elements[4 * 2]; // 4 pixels, 2 channels + // Multiply x with the number of channels (2) + x = vshlq_n_u32(x, 1); + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + memcpy(&elements[0], &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes); + memcpy(&elements[2], &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes); + memcpy(&elements[4], &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes); + memcpy(&elements[6], &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes); + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); + memcpy(&elements[0], &src_rows[vgetq_lane_u32(indices, 0)], kBytes); + memcpy(&elements[2], &src_rows[vgetq_lane_u32(indices, 1)], kBytes); + memcpy(&elements[4], &src_rows[vgetq_lane_u32(indices, 2)], kBytes); + memcpy(&elements[6], &src_rows[vgetq_lane_u32(indices, 3)], kBytes); + } + uint16x8_t pixels16{}; + if constexpr (std::is_same::value) { + pixels16 = vmovl_u8(vld1_u8(elements)); + } else if constexpr (std::is_same::value) { + pixels16 = vld1q_u16(elements); + } + float32x4x2_t result; + result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16))); + result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16)); + return result; +} + template float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, uint32x4_t in_range, @@ -52,40 
+88,40 @@ float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, uint32x4_t v_src_stride, Rows src_rows) { if constexpr (IsLarge) { - uint64x2_t offset_low = + uint64x2_t indices_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), vget_low_u32(v_src_stride)); - uint64x2_t offset_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), - vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); uint64_t pixel0 = vgetq_lane_u32(in_range, 0) - ? src_rows[vgetq_lane_u64(offset_low, 0)] + ? src_rows[vgetq_lane_u64(indices_low, 0)] : border_value; uint64_t pixel1 = vgetq_lane_u32(in_range, 1) - ? src_rows[vgetq_lane_u64(offset_low, 1)] + ? src_rows[vgetq_lane_u64(indices_low, 1)] : border_value; uint64_t pixel2 = vgetq_lane_u32(in_range, 2) - ? src_rows[vgetq_lane_u64(offset_high, 0)] + ? src_rows[vgetq_lane_u64(indices_high, 0)] : border_value; uint64_t pixel3 = vgetq_lane_u32(in_range, 3) - ? src_rows[vgetq_lane_u64(offset_high, 1)] + ? src_rows[vgetq_lane_u64(indices_high, 1)] : border_value; uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1); return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); } else { - uint32x4_t offset = vmlaq_u32(x, y, v_src_stride); + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); uint64_t pixel0 = vgetq_lane_u32(in_range, 0) - ? src_rows[vgetq_lane_u32(offset, 0)] + ? src_rows[vgetq_lane_u32(indices, 0)] : border_value; uint64_t pixel1 = vgetq_lane_u32(in_range, 1) - ? src_rows[vgetq_lane_u32(offset, 1)] + ? src_rows[vgetq_lane_u32(indices, 1)] : border_value; uint64_t pixel2 = vgetq_lane_u32(in_range, 2) - ? src_rows[vgetq_lane_u32(offset, 2)] + ? src_rows[vgetq_lane_u32(indices, 2)] : border_value; uint64_t pixel3 = vgetq_lane_u32(in_range, 3) - ? src_rows[vgetq_lane_u32(offset, 3)] + ? 
src_rows[vgetq_lane_u32(indices, 3)] : border_value; uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); @@ -94,6 +130,62 @@ float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, } } +template +float32x4x2_t inline load_xy_or_border_2ch(uint32x4_t x, uint32x4_t y, + uint32x4_t in_range, + const ScalarType* border_values, + uint32x4_t v_src_stride, + Rows src_rows) { + const size_t kBytes = 2 * sizeof(ScalarType); + const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{}; + ScalarType elements[4 * 2]; // 4 pixels, 2 channels + // Multiply x with the number of channels + x = vshlq_n_u32(x, 1); + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + pixel0 = vgetq_lane_u32(in_range, 0) + ? &src_rows[vgetq_lane_u64(indices_low, 0)] + : border_values; + pixel1 = vgetq_lane_u32(in_range, 1) + ? &src_rows[vgetq_lane_u64(indices_low, 1)] + : border_values; + pixel2 = vgetq_lane_u32(in_range, 2) + ? &src_rows[vgetq_lane_u64(indices_high, 0)] + : border_values; + pixel3 = vgetq_lane_u32(in_range, 3) + ? &src_rows[vgetq_lane_u64(indices_high, 1)] + : border_values; + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); + pixel0 = vgetq_lane_u32(in_range, 0) ? &src_rows[vgetq_lane_u32(indices, 0)] + : border_values; + pixel1 = vgetq_lane_u32(in_range, 1) ? &src_rows[vgetq_lane_u32(indices, 1)] + : border_values; + pixel2 = vgetq_lane_u32(in_range, 2) ? &src_rows[vgetq_lane_u32(indices, 2)] + : border_values; + pixel3 = vgetq_lane_u32(in_range, 3) ? 
&src_rows[vgetq_lane_u32(indices, 3)] + : border_values; + } + memcpy(&elements[0], pixel0, kBytes); + memcpy(&elements[2], pixel1, kBytes); + memcpy(&elements[4], pixel2, kBytes); + memcpy(&elements[6], pixel3, kBytes); + uint16x8_t pixels16{}; + if constexpr (std::is_same::value) { + pixels16 = vmovl_u8(vld1_u8(elements)); + } else if constexpr (std::is_same::value) { + pixels16 = vld1q_u16(elements); + } + float32x4x2_t result; + result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16))); + result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16)); + return result; +} + template void load_quad_pixels_replicate(FloatVectorPair xy, uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, @@ -124,10 +216,40 @@ void load_quad_pixels_replicate(FloatVectorPair xy, uint32x4_t v_xmax, d = load_xy(x1, y1, v_src_stride, src_rows); } +template +void load_quad_pixels_replicate_2ch(FloatVectorPair xy, uint32x4_t v_xmax, + uint32x4_t v_ymax, uint32x4_t v_src_stride, + Rows src_rows, + float32x4_t& xfrac, float32x4_t& yfrac, + float32x4x2_t& a, float32x4x2_t& b, + float32x4x2_t& c, float32x4x2_t& d) { + auto&& [xf, yf] = xy; + // Truncating convert to int + uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax); + uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax); + + // Get fractional part, or 0 if out of range + float32x4_t zero = vdupq_n_f32(0.F); + uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax)); + uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax)); + xfrac = vsubq_f32(xf, vrndmq_f32(xf)); + yfrac = vsubq_f32(yf, vrndmq_f32(yf)); + + // x1 = x0 + 1, except if it's already xmax or out of range + uint32x4_t x1 = vsubq_u32(x0, x_in_range); + uint32x4_t y1 = vsubq_u32(y0, y_in_range); + + // a: top left, b: top right, c: bottom left, d: bottom right + a = load_xy_2ch(x0, y0, v_src_stride, src_rows); + b = load_xy_2ch(x1, y0, v_src_stride, src_rows); + c = load_xy_2ch(x0, y1, v_src_stride, src_rows); + d = 
load_xy_2ch(x1, y1, v_src_stride, src_rows); +} + template void load_quad_pixels_constant(FloatVectorPair xy, uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, - ScalarType border_value, + const ScalarType* border_values, Rows src_rows, float32x4_t& xfrac, float32x4_t& yfrac, float32x4_t& a, float32x4_t& b, float32x4_t& c, @@ -155,14 +277,55 @@ void load_quad_pixels_constant(FloatVectorPair xy, uint32x4_t v_xmax, c_in_range = vandq(x0_in_range, y1_in_range); d_in_range = vandq(x1_in_range, y1_in_range); } - a = load_xy_or_border(x0, y0, a_in_range, border_value, - v_src_stride, src_rows); - b = load_xy_or_border(x1, y0, b_in_range, border_value, - v_src_stride, src_rows); - c = load_xy_or_border(x0, y1, c_in_range, border_value, - v_src_stride, src_rows); - d = load_xy_or_border(x1, y1, d_in_range, border_value, - v_src_stride, src_rows); + a = load_xy_or_border( + x0, y0, a_in_range, border_values[0], v_src_stride, src_rows); + b = load_xy_or_border( + x1, y0, b_in_range, border_values[0], v_src_stride, src_rows); + c = load_xy_or_border( + x0, y1, c_in_range, border_values[0], v_src_stride, src_rows); + d = load_xy_or_border( + x1, y1, d_in_range, border_values[0], v_src_stride, src_rows); +} + +template +void load_quad_pixels_constant_2ch(FloatVectorPair xy, uint32x4_t v_xmax, + uint32x4_t v_ymax, uint32x4_t v_src_stride, + const ScalarType* border_values, + Rows src_rows, + float32x4_t& xfrac, float32x4_t& yfrac, + float32x4x2_t& a, float32x4x2_t& b, + float32x4x2_t& c, float32x4x2_t& d) { + auto&& [xf, yf] = xy; + // Convert coordinates to integers, truncating towards minus infinity. + // Negative numbers will become large positive numbers. + // Since the source width and height is known to be <=2^24 these large + // positive numbers will always be treated as outside the source image + // bounds. 
+ uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf)); + uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf)); + uint32x4_t x1 = vaddq(x0, vdupq_n_u32(1)); + uint32x4_t y1 = vaddq(y0, vdupq_n_u32(1)); + xfrac = vsubq_f32(xf, vrndmq_f32(xf)); + yfrac = vsubq_f32(yf, vrndmq_f32(yf)); + uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range; + { + uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax); + uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax); + uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax); + uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax); + a_in_range = vandq(x0_in_range, y0_in_range); + b_in_range = vandq(x1_in_range, y0_in_range); + c_in_range = vandq(x0_in_range, y1_in_range); + d_in_range = vandq(x1_in_range, y1_in_range); + } + a = load_xy_or_border_2ch( + x0, y0, a_in_range, border_values, v_src_stride, src_rows); + b = load_xy_or_border_2ch( + x1, y0, b_in_range, border_values, v_src_stride, src_rows); + c = load_xy_or_border_2ch( + x0, y1, c_in_range, border_values, v_src_stride, src_rows); + d = load_xy_or_border_2ch( + x1, y1, d_in_range, border_values, v_src_stride, src_rows); } inline uint32x4_t lerp_2d(float32x4_t xfrac, float32x4_t yfrac, float32x4_t a, @@ -173,10 +336,10 @@ inline uint32x4_t lerp_2d(float32x4_t xfrac, float32x4_t yfrac, float32x4_t a, return vcvtaq_u32_f32(result); } -template +template void transform_pixels_replicate(float32x4_t xf, float32x4_t yf, uint32x4_t v_xmax, uint32x4_t v_ymax, - uint32x4_t v_src_stride, + uint32x4_t v_src_element_stride, Rows src_rows, Columns dst) { // Round to nearest, with Ties To Away (i.e. 
round 0.5 up) @@ -184,34 +347,71 @@ void transform_pixels_replicate(float32x4_t xf, float32x4_t yf, // (vcvtaq already converted negative values to 0) uint32x4_t x = vminq_u32(vcvtaq_u32_f32(xf), v_xmax); uint32x4_t y = vminq_u32(vcvtaq_u32_f32(yf), v_ymax); - + if constexpr (Channels == 2) { + // Multiply x with the number of channels + x = vshlq_n_u32(x, 1); + } // Copy pixels from source if constexpr (IsLarge) { uint64x2_t indices_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_stride)); + vget_low_u32(v_src_element_stride)); uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), - vget_low_u32(v_src_stride)); - dst[0] = src_rows[vgetq_lane_u64(indices_low, 0)]; - dst[1] = src_rows[vgetq_lane_u64(indices_low, 1)]; - dst[2] = src_rows[vgetq_lane_u64(indices_high, 0)]; - dst[3] = src_rows[vgetq_lane_u64(indices_high, 1)]; + vget_low_u32(v_src_element_stride)); + if constexpr (Channels == 1) { + dst[0] = src_rows[vgetq_lane_u64(indices_low, 0)]; + dst[1] = src_rows[vgetq_lane_u64(indices_low, 1)]; + dst[2] = src_rows[vgetq_lane_u64(indices_high, 0)]; + dst[3] = src_rows[vgetq_lane_u64(indices_high, 1)]; + } else { + const size_t kBytes = Channels * sizeof(ScalarType); + memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes); + memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes); + memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes); + memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes); + } } else { - uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); - dst[0] = src_rows[vgetq_lane_u32(indices, 0)]; - dst[1] = src_rows[vgetq_lane_u32(indices, 1)]; - dst[2] = src_rows[vgetq_lane_u32(indices, 2)]; - dst[3] = src_rows[vgetq_lane_u32(indices, 3)]; + uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride); + if constexpr (Channels == 1) { + dst[0] = src_rows[vgetq_lane_u32(indices, 0)]; + dst[1] = src_rows[vgetq_lane_u32(indices, 
1)]; + dst[2] = src_rows[vgetq_lane_u32(indices, 2)]; + dst[3] = src_rows[vgetq_lane_u32(indices, 3)]; + } else { + const size_t kBytes = Channels * sizeof(ScalarType); + memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u32(indices, 0)], kBytes); + memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u32(indices, 1)], kBytes); + memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u32(indices, 2)], kBytes); + memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u32(indices, 3)], kBytes); + } } } -template +template +static const ScalarType* get_src_or_border_small( + uint32x4_t in_range, Rows src_rows, uint32x4_t indices, + const ScalarType* border_values) { + return vgetq_lane_u32(in_range, Lane) + ? &src_rows[vgetq_lane_u32(indices, Lane)] + : border_values; +} + +template +static const ScalarType* get_src_or_border_large( + uint32x4_t in_range, Rows src_rows, uint64x2_t indices, + const ScalarType* border_values) { + return vgetq_lane_u32(in_range, Lane) + ? &src_rows[vgetq_lane_u64(indices, Lane % 2)] + : border_values; +} + +template void transform_pixels_constant(float32x4_t xf, float32x4_t yf, uint32x4_t v_xmax, uint32x4_t v_ymax, - uint32x4_t v_src_stride, + uint32x4_t v_src_element_stride, Rows src_rows, Columns dst, - ScalarType border_value) { + const ScalarType* border_values) { // Convert coordinates to integers. // Negative numbers will become large positive numbers. // Since the source width and height is known to be <=2^24 these large @@ -222,34 +422,88 @@ void transform_pixels_constant(float32x4_t xf, float32x4_t yf, uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); // Copy pixels from source - if constexpr (IsLarge) { - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_stride)); - uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), - vget_low_u32(v_src_stride)); - dst[0] = vgetq_lane_u32(in_range, 0) - ? 
src_rows[vgetq_lane_u64(indices_low, 0)] - : border_value; - dst[1] = vgetq_lane_u32(in_range, 1) - ? src_rows[vgetq_lane_u64(indices_low, 1)] - : border_value; - dst[2] = vgetq_lane_u32(in_range, 2) - ? src_rows[vgetq_lane_u64(indices_high, 0)] - : border_value; - dst[3] = vgetq_lane_u32(in_range, 3) - ? src_rows[vgetq_lane_u64(indices_high, 1)] - : border_value; + if constexpr (Channels == 1) { + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_element_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_element_stride)); + dst[0] = *get_src_or_border_large<0>(in_range, src_rows, indices_low, + border_values); + dst[1] = *get_src_or_border_large<1>(in_range, src_rows, indices_low, + border_values); + dst[2] = *get_src_or_border_large<2>(in_range, src_rows, indices_high, + border_values); + dst[3] = *get_src_or_border_large<3>(in_range, src_rows, indices_high, + border_values); + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride); + dst[0] = *get_src_or_border_small<0>(in_range, src_rows, indices, + border_values); + dst[1] = *get_src_or_border_small<1>(in_range, src_rows, indices, + border_values); + dst[2] = *get_src_or_border_small<2>(in_range, src_rows, indices, + border_values); + dst[3] = *get_src_or_border_small<3>(in_range, src_rows, indices, + border_values); + } + } else { // Channels > 1 + const size_t kBytes = Channels * sizeof(ScalarType); + const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{}; + // Multiply x with the number of channels + if constexpr (Channels == 2) { + x = vshlq_n_u32(x, 1); + } else { + x = vmulq_n_u32(x, Channels); + } + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_element_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + 
vget_low_u32(v_src_element_stride)); + pixel0 = get_src_or_border_large<0>(in_range, src_rows, indices_low, + border_values); + pixel1 = get_src_or_border_large<1>(in_range, src_rows, indices_low, + border_values); + pixel2 = get_src_or_border_large<2>(in_range, src_rows, indices_high, + border_values); + pixel3 = get_src_or_border_large<3>(in_range, src_rows, indices_high, + border_values); + + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride); + pixel0 = get_src_or_border_small<0>(in_range, src_rows, indices, + border_values); + pixel1 = get_src_or_border_small<1>(in_range, src_rows, indices, + border_values); + pixel2 = get_src_or_border_small<2>(in_range, src_rows, indices, + border_values); + pixel3 = get_src_or_border_small<3>(in_range, src_rows, indices, + border_values); + } + memcpy(dst.ptr_at(0), pixel0, kBytes); + memcpy(dst.ptr_at(1), pixel1, kBytes); + memcpy(dst.ptr_at(2), pixel2, kBytes); + memcpy(dst.ptr_at(3), pixel3, kBytes); + } +} + +template +void transform_pixels(float32x4_t xf, float32x4_t yf, uint32x4_t v_xmax, + uint32x4_t v_ymax, uint32x4_t v_src_element_stride, + Rows src_rows, Columns dst, + [[maybe_unused]] const ScalarType* border_values) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + transform_pixels_replicate( + xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst); } else { - uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); - dst[0] = vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u32(indices, 0)] - : border_value; - dst[1] = vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u32(indices, 1)] - : border_value; - dst[2] = vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u32(indices, 2)] - : border_value; - dst[3] = vgetq_lane_u32(in_range, 3) ? 
src_rows[vgetq_lane_u32(indices, 3)] - : border_value; + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + transform_pixels_constant( + xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst, + border_values); } } diff --git a/kleidicv/src/transform/transform_sve2.h b/kleidicv/src/transform/transform_sve2.h index 74b6d889a..b9cb5ed46 100644 --- a/kleidicv/src/transform/transform_sve2.h +++ b/kleidicv/src/transform/transform_sve2.h @@ -2,21 +2,12 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - -#include #include #include #include #include -#include -#include -#include -#include "kleidicv/ctypes.h" -#include "kleidicv/kleidicv.h" #include "kleidicv/sve2.h" -#include "kleidicv/traits.h" #include "kleidicv/types.h" #include "transform_common.h" @@ -26,56 +17,115 @@ template svuint32_t inline load_xy(svbool_t pg, svuint32_t x, svuint32_t y, svuint32_t sv_src_stride, Rows &src_rows) { - if constexpr (std::is_same::value) { - if constexpr (IsLarge) { - svbool_t pg_b = pg; - svbool_t pg_t = svtrn2_b32(pg, svpfalse()); + if constexpr (IsLarge) { + svbool_t pg_b = pg; + svbool_t pg_t = svtrn2_b32(pg, svpfalse()); - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + svuint64_t result_b, result_t; + if constexpr (std::is_same::value) { svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); // Copy pixels from source - svuint64_t result_b = - svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); - svuint64_t result_t = - svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); - return svtrn1_u32(svreinterpret_u32_u64(result_b), - svreinterpret_u32_u64(result_t)); - } else { - svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); - return svld1ub_gather_offset_u32(pg, &src_rows[0], 
offsets); + result_b = svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); + result_t = svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); } - } else if constexpr (std::is_same::value) { - if constexpr (IsLarge) { - svbool_t pg_b = pg; - svbool_t pg_t = svtrn2_b32(pg, svpfalse()); - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits + if constexpr (std::is_same::value) { + // Multiply x with sizeof(uint16_t) svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride); svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride); // Copy pixels from source - svuint64_t result_b, result_t; result_b = svld1uh_gather_offset_u64(pg_b, &src_rows[0], offsets_b); result_t = svld1uh_gather_offset_u64(pg_t, &src_rows[0], offsets_t); - return svtrn1_u32(svreinterpret_u32_u64(result_b), - svreinterpret_u32_u64(result_t)); + } + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); + } else { + if constexpr (std::is_same::value) { + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); + return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); } else { - svuint32_t offsets = - svmla_x(pg, svlsl_n_u32_x(pg, x, 1), y, sv_src_stride); + // Multiply by sizeof(uint16_t) + x = svlsl_x(pg, x, 1); + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); return svld1uh_gather_offset_u32(pg, &src_rows[0], offsets); } } } template +svuint32_t inline load_xy_2ch(svbool_t pg, svuint32_t x, svuint32_t y, + svuint32_t sv_src_stride, + Rows &src_rows, + [[maybe_unused]] svuint8_t load_table) { + if constexpr (IsLarge) { + svbool_t pg_b = pg; + svbool_t pg_t = svtrn2_b32(pg, svpfalse()); + + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + if constexpr (std::is_same::value) { + // Multiply x with the number of channels + svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride); + 
svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride); + // Copy pixels from source + svuint64_t b = svld1uh_gather_offset_u64( + pg_b, reinterpret_cast(&src_rows[0]), offsets_b); + svuint64_t t = svld1uh_gather_offset_u64( + pg_t, reinterpret_cast(&src_rows[0]), offsets_t); + svuint32_t r32 = + svtrn1_u32(svreinterpret_u32_u64(b), svreinterpret_u32_u64(t)); + return svreinterpret_u32_u8( + svtbl_u8(svreinterpret_u8_u32(r32), load_table)); + } + if constexpr (std::is_same::value) { + // Multiply x with the number of channels and sizeof(uint16_t) + svuint64_t offsets_b = svmlalb(svshllb(x, 2), y, sv_src_stride); + svuint64_t offsets_t = svmlalt(svshllt(x, 2), y, sv_src_stride); + // Copy pixels from source + svuint64_t result_b = svld1uw_gather_offset_u64( + pg_b, reinterpret_cast(&src_rows[0]), offsets_b); + svuint64_t result_t = svld1uw_gather_offset_u64( + pg_t, reinterpret_cast(&src_rows[0]), offsets_t); + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); + } + } else { + // Multiply x with the number of channels and sizeof(ScalarType) + // This shifting formula is only correct for 8 and 16 bits + x = svlsl_n_u32_x(pg, x, sizeof(ScalarType)); + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); + if constexpr (std::is_same::value) { + svuint32_t r32 = svld1uh_gather_offset_u32( + pg, reinterpret_cast(&src_rows[0]), offsets); + return svreinterpret_u32_u8( + svtbl_u8(svreinterpret_u8_u32(r32), load_table)); + } + if constexpr (std::is_same::value) { + return svld1_gather_u32offset_u32( + pg, reinterpret_cast(&src_rows[0]), offsets); + } + } +} + +template svuint32_t inline calculate_linear_replicated_border( svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf, - svuint32_t sv_src_stride, Rows &src_rows) { + svuint32_t sv_src_stride, Rows &src_rows, + svuint8_t load_table_2ch) { + svbool_t pg_all32 = svptrue_b32(); + auto load_source = [&](svuint32_t x, svuint32_t y) { - return load_xy(pg, x, 
y, sv_src_stride, src_rows); + if constexpr (Channels == 1) { + return load_xy(pg, x, y, sv_src_stride, src_rows); + } + if constexpr (Channels == 2) { + return load_xy_2ch(pg, x, y, sv_src_stride, src_rows, + load_table_2ch); + } }; - svbool_t pg_all32 = svptrue_b32(); + svfloat32_t xf = svget2(coords, 0); svfloat32_t yf = svget2(coords, 1); // Take the integer part, clamp it to within the dimensions of the @@ -99,19 +149,43 @@ svuint32_t inline calculate_linear_replicated_border( svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0); svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0); + auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci, + svuint32_t di) { + svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai); + svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi); + svfloat32_t line0 = + svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); + svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci); + svfloat32_t d = svcvt_f32_u32_x(pg_all32, di); + svfloat32_t line1 = + svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); + svfloat32_t result = svmla_f32_x( + pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); + return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)); + }; + // Calculate offsets from coordinates (y * stride + x) // a: top left, b: top right, c: bottom left, d: bottom right - svfloat32_t a = svcvt_f32_u32_x(pg_all32, load_source(x0, y0)); - svfloat32_t b = svcvt_f32_u32_x(pg_all32, load_source(x1, y0)); - svfloat32_t line0 = - svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); - svfloat32_t c = svcvt_f32_u32_x(pg_all32, load_source(x0, y1)); - svfloat32_t d = svcvt_f32_u32_x(pg_all32, load_source(x1, y1)); - svfloat32_t line1 = - svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); - svfloat32_t result = - svmla_f32_x(pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); - return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)); 
+ svuint32_t a = load_source(x0, y0); + svuint32_t b = load_source(x1, y0); + svuint32_t c = load_source(x0, y1); + svuint32_t d = load_source(x1, y1); + if constexpr (Channels == 1) { + return lerp_2d(a, b, c, d); + } + if constexpr (Channels == 2) { + // Channel 0 + svuint32_t res32_0 = lerp_2d( + svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)), + svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d))); + // Channel 1 + svuint32_t res32_1 = lerp_2d( + svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)), + svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d))); + + return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1))); + } } template @@ -128,10 +202,39 @@ svuint32_t get_pixels_or_border(svbool_t pg, svuint32_t x, svuint32_t y, } template +svuint32_t get_pixels_or_border_2ch(svbool_t pg, svuint32_t x, svuint32_t y, + svuint32_t sv_border, svuint32_t sv_xmax, + svuint32_t sv_ymax, + svuint32_t sv_src_stride, + Rows &src_rows, + svuint8_t load_table) { + svbool_t in_range = + svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); + svuint32_t result = load_xy_2ch( + in_range, x, y, sv_src_stride, src_rows, load_table); + // Select between source pixels and border colour + return svsel_u32(in_range, result, sv_border); +} + +template svuint32_t inline calculate_linear_constant_border( svbool_t pg, svfloat32x2_t coords, svuint32_t sv_border, svuint32_t sv_xmax, svuint32_t sv_ymax, svuint32_t sv_src_stride, - Rows &src_rows) { + Rows &src_rows, svuint8_t load_table_2ch) { + svbool_t pg_all32 = svptrue_b32(); + + auto load_source = [&](svuint32_t x, svuint32_t y) { + if constexpr (Channels == 1) { + return get_pixels_or_border( + pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); + } + if constexpr (Channels == 2) { + return get_pixels_or_border_2ch( + pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows, + 
load_table_2ch); + } + }; + // Convert coordinates to integers, truncating towards minus infinity. // Negative numbers will become large positive numbers. // Since the source width and height is known to be <=2^24 these large @@ -153,23 +256,43 @@ svuint32_t inline calculate_linear_constant_border( yfrac = svsub_f32_x(pg, yf, yf0); } - svfloat32_t a = svcvt_f32_u32_x(pg, get_pixels_or_border( - pg, x0, y0, sv_border, sv_xmax, - sv_ymax, sv_src_stride, src_rows)); - svfloat32_t b = svcvt_f32_u32_x(pg, get_pixels_or_border( - pg, x1, y0, sv_border, sv_xmax, - sv_ymax, sv_src_stride, src_rows)); - svfloat32_t line0 = svmla_f32_x(pg, a, svsub_f32_x(pg, b, a), xfrac); - svfloat32_t c = svcvt_f32_u32_x(pg, get_pixels_or_border( - pg, x0, y1, sv_border, sv_xmax, - sv_ymax, sv_src_stride, src_rows)); - svfloat32_t d = svcvt_f32_u32_x(pg, get_pixels_or_border( - pg, x1, y1, sv_border, sv_xmax, - sv_ymax, sv_src_stride, src_rows)); - svfloat32_t line1 = svmla_f32_x(pg, c, svsub_f32_x(pg, d, c), xfrac); - svfloat32_t result = - svmla_f32_x(pg, line0, svsub_f32_x(pg, line1, line0), yfrac); - return svcvt_u32_f32_x(pg, svrinta_f32_x(pg, result)); + auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci, + svuint32_t di) { + svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai); + svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi); + svfloat32_t line0 = + svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); + svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci); + svfloat32_t d = svcvt_f32_u32_x(pg_all32, di); + svfloat32_t line1 = + svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); + svfloat32_t result = svmla_f32_x( + pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); + return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)); + }; + + // Calculate offsets from coordinates (y * stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + svuint32_t a = load_source(x0, y0); + svuint32_t b = load_source(x1, y0); + svuint32_t c 
= load_source(x0, y1); + svuint32_t d = load_source(x1, y1); + if constexpr (Channels == 1) { + return lerp_2d(a, b, c, d); + } + if constexpr (Channels == 2) { + // Channel 0 + svuint32_t res32_0 = lerp_2d( + svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)), + svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d))); + // Channel 1 + svuint32_t res32_1 = lerp_2d( + svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)), + svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d))); + + return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1))); + } } } // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp index dd7494523..e76d14493 100644 --- a/kleidicv/src/transform/warp_perspective_neon.cpp +++ b/kleidicv/src/transform/warp_perspective_neon.cpp @@ -1,9 +1,7 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include "kleidicv/ctypes.h" @@ -34,10 +32,11 @@ namespace kleidicv::neon { // template + kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border, + size_t Channels> void transform_operation(Rows src_rows, size_t src_width, size_t src_height, const float transform[9], - const ScalarType *border_value, + const ScalarType *border_values, Rows dst_rows, size_t dst_width, size_t y_begin, size_t y_end) { static constexpr uint32_t first_few_x[] = {0, 1, 2, 3}; @@ -73,8 +72,8 @@ void transform_operation(Rows src_rows, size_t src_width, } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); load_quad_pixels_constant( - calculate_coordinates(x), v_xmax, v_ymax, v_src_stride, - border_value[0], src_rows, xfrac, yfrac, a, b, c, d); + calculate_coordinates(x), v_xmax, v_ymax, v_src_stride, border_values, + 
src_rows, xfrac, yfrac, a, b, c, d); } return lerp_2d(xfrac, yfrac, a, b, c, d); }; @@ -96,16 +95,16 @@ void transform_operation(Rows src_rows, size_t src_width, if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { loop.unroll_once([&](size_t x) { auto &&[xf, yf] = calculate_coordinates(x); - transform_pixels_replicate( + transform_pixels_replicate( xf, yf, v_xmax, v_ymax, v_src_stride, src_rows, dst.at(x)); }); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); loop.unroll_once([&](size_t x) { auto &&[xf, yf] = calculate_coordinates(x); - transform_pixels_constant( + transform_pixels_constant( xf, yf, v_xmax, v_ymax, v_src_stride, src_rows, dst.at(x), - border_value[0]); + border_values); }); } } else { @@ -168,7 +167,7 @@ kleidicv_error_t warp_perspective_stripe( dst_rows += y_begin; transform_operation(is_image_large(src_rows, src_height), interpolation, - border_type, src_rows, src_width, src_height, + border_type, channels, src_rows, src_width, src_height, transformation, border_value, dst_rows, dst_width, y_begin, y_end); return KLEIDICV_OK; diff --git a/kleidicv/src/transform/warp_perspective_sve2.cpp b/kleidicv/src/transform/warp_perspective_sve2.cpp index b1f3df8d2..7b00b8492 100644 --- a/kleidicv/src/transform/warp_perspective_sve2.cpp +++ b/kleidicv/src/transform/warp_perspective_sve2.cpp @@ -33,7 +33,8 @@ namespace kleidicv::sve2 { // template + kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border, + size_t Channels> void transform_operation(Rows src_rows, size_t src_width, size_t src_height, const ScalarType *border_value, Rows dst_rows, size_t dst_width, @@ -160,15 +161,20 @@ void transform_operation(Rows src_rows, size_t src_width, svst1b_u32(pg32, &dst[static_cast(x)], result); }; + // WarpPerspective does not implement 2 channels, so this is dummy + svuint8_t dummy_load_table_2ch{}; + auto calculate_linear = [&](svbool_t pg, uint32_t x) { svfloat32x2_t coords = calc_coords(pg, x); if constexpr (Border == 
KLEIDICV_BORDER_TYPE_REPLICATE) { - return calculate_linear_replicated_border( - pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); + return calculate_linear_replicated_border( + pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows, + dummy_load_table_2ch); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - return calculate_linear_constant_border( - pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); + return calculate_linear_constant_border( + pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows, + dummy_load_table_2ch); } }; @@ -265,7 +271,7 @@ KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe( dst_rows += y_begin; transform_operation(is_image_large(src_rows, src_height), interpolation, - border_type, src_rows, src_width, src_height, + border_type, channels, src_rows, src_width, src_height, border_value, dst_rows, dst_width, y_begin, y_end, transform); diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index b8368e013..f72010f92 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -765,8 +765,8 @@ kleidicv_error_t kleidicv_thread_remap_f32_u8( kleidicv_border_type_t border_type, const uint8_t *border_value, kleidicv_thread_multithreading mt) { if (!kleidicv::remap_f32_is_implemented( - src_stride, src_width, src_height, dst_width, border_type, channels, - interpolation)) { + src_stride, src_width, src_height, dst_width, dst_height, border_type, + channels, interpolation)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } auto callback = [=](unsigned begin, unsigned end) { @@ -790,8 +790,8 @@ kleidicv_error_t kleidicv_thread_remap_f32_u16( kleidicv_border_type_t border_type, const uint16_t *border_value, kleidicv_thread_multithreading mt) { if (!kleidicv::remap_f32_is_implemented( - src_stride, src_width, src_height, dst_width, border_type, channels, - interpolation)) { + src_stride, src_width, src_height, dst_width, 
dst_height, border_type, + channels, interpolation)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } auto callback = [=](unsigned begin, unsigned end) { diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 0625b6bd3..ca82aeda0 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -86,14 +86,22 @@ Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8U Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' -Remap_F32_U8_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' -Remap_F32_U8_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' -Remap_F32_U16_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' -Remap_F32_U16_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' Remap_F32_U8_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' Remap_F32_U8_Constant_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' Remap_F32_U16_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' Remap_F32_U16_Constant_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' +Remap_F32_U8_Replicate_Nearest_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC2, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' +Remap_F32_U8_Constant_Nearest_2ch: 
opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC2, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' +Remap_F32_U16_Replicate_Nearest_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC2, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' +Remap_F32_U16_Constant_Nearest_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC2, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' +Remap_F32_U8_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' +Remap_F32_U8_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' +Remap_F32_U16_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' +Remap_F32_U16_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' +Remap_F32_U8_Replicate_Linear_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC2, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' +Remap_F32_U8_Constant_Linear_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC2, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' +Remap_F32_U16_Replicate_Linear_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC2, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' +Remap_F32_U16_Constant_Linear_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC2, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 1)' WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 1)' diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index d3bde250c..bea82b90b 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -37,11 +37,12 @@ static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, kleidicv_border_type_t border_type, const ScalarType *border_value) { if (border_type == 
KLEIDICV_BORDER_TYPE_REPLICATE) { - x = std::clamp(x, 0, static_cast(src.width()) - 1); + x = std::clamp( + x, 0, static_cast(src.width() / src.channels()) - 1); y = std::clamp(y, 0, static_cast(src.height()) - 1); } else { assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT); - if (x >= static_cast(src.width()) || + if (x * src.channels() >= src.width() || y >= static_cast(src.height()) || x < 0 || y < 0) { return border_value; } @@ -216,7 +217,6 @@ static const auto &get_borders() { static const T border_value[KLEIDICV_MAXIMUM_CHANNEL_COUNT] = {4, 5, 6, 7}; static const std::array borders{ P{KLEIDICV_BORDER_TYPE_REPLICATE, nullptr}, - P{KLEIDICV_BORDER_TYPE_REPLICATE, border_value}, P{KLEIDICV_BORDER_TYPE_CONSTANT, border_value}, }; return borders; @@ -970,28 +970,29 @@ class RemapF32 : public testing::Test { test::Array2D expected{dst_total_width, dst_h, padding, channels}; - // Initalize the edges only + // Initalize the four corners only const int64_t kMaxVal = std::numeric_limits::max() * 3 / 4; const int64_t kMinVal = std::numeric_limits::lowest() + kMaxVal / 3; auto generateSource = [&](size_t x, size_t y) { return static_cast((x + y) % 2 ? 
kMaxVal : kMinVal); }; - for (size_t y = 0; y < src_h; ++y) { + for (size_t y = 0; y < 2; ++y) { *source.at(y, 0) = generateSource(y, 0); *source.at(y, 1) = generateSource(y, 1); *source.at(y, 2) = generateSource(y, 2); *source.at(y, src_w - 3) = generateSource(y, src_w - 3); *source.at(y, src_w - 2) = generateSource(y, src_w - 2); *source.at(y, src_w - 1) = generateSource(y, src_w - 1); - } - for (size_t x = 0; x < src_w; ++x) { - *source.at(0, x) = generateSource(0, x); - *source.at(1, x) = generateSource(1, x); - *source.at(2, x) = generateSource(2, x); - *source.at(src_h - 3, x) = generateSource(src_h - 3, x); - *source.at(src_h - 2, x) = generateSource(src_h - 2, x); - *source.at(src_h - 1, x) = generateSource(src_h - 1, x); + *source.at(src_h - y - 1, 0) = generateSource(src_h - y - 1, 0); + *source.at(src_h - y - 1, 1) = generateSource(src_h - y - 1, 1); + *source.at(src_h - y - 1, 2) = generateSource(src_h - y - 1, 2); + *source.at(src_h - y - 1, src_w - 3) = + generateSource(src_h - y - 1, src_w - 3); + *source.at(src_h - y - 1, src_w - 2) = + generateSource(src_h - y - 1, src_w - 2); + *source.at(src_h - y - 1, src_w - 1) = + generateSource(src_h - y - 1, src_w - 1); } test::PseudoRandomNumberGenerator generator; @@ -1000,13 +1001,12 @@ class RemapF32 : public testing::Test { calculate_expected(source, mapx, mapy, border_type, border_value, interpolation, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_f32()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(), - interpolation, border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_f32()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapx.data(), mapx.stride(), mapy.data(), + mapy.stride(), interpolation, border_type, border_value)); if (expected.compare_to(actual, 1)) { 
if (source.width() < 100 && source.height() < 100) { @@ -1048,13 +1048,12 @@ class RemapF32 : public testing::Test { calculate_expected(source, mapx, mapy, border_type, border_value, interpolation, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_f32()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(), - interpolation, border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_f32()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapx.data(), mapx.stride(), mapy.data(), + mapy.stride(), interpolation, border_type, border_value)); if (expected.compare_to(actual, 1)) { if (source.width() < 100 && source.height() < 100) { @@ -1142,14 +1141,15 @@ TYPED_TEST(RemapF32, RandomNoPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - size_t channels = 1; size_t padding = 0; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, - border_type, border_value, interpolation, - padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, interpolation, + padding); + } } } } @@ -1159,13 +1159,15 @@ TYPED_TEST(RemapF32, BlendPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - size_t channels = 1; size_t padding = 13; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - 
TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, - border_value, interpolation, padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, interpolation, + padding); + } } } } @@ -1175,14 +1177,15 @@ TYPED_TEST(RemapF32, OutsideRandomPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - size_t channels = 1; size_t padding = 13; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels, - border_type, border_value, interpolation, - padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, + interpolation, padding); + } } } } @@ -1192,13 +1195,15 @@ TYPED_TEST(RemapF32, BlendBigStride) { size_t src_h = 2; size_t dst_w = src_w; size_t dst_h = src_h; - size_t channels = 1; size_t padding = 1 << 16; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, - border_value, interpolation, padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, +
border_type, border_value, interpolation, + padding); + } } } } @@ -1208,14 +1213,15 @@ TYPED_TEST(RemapF32, CornerCases) { size_t src_h = (1ULL << 12) - 1; size_t dst_w = 4; size_t dst_h = 3 * test::Options::vector_lanes() - 1; - size_t channels = 1; size_t padding = 17; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, - border_type, border_value, interpolation, - padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, interpolation, + padding); + } } } } @@ -1227,14 +1233,15 @@ TYPED_TEST(RemapF32, CornerCasesLargeLoad) { size_t src_h = 1ULL << 14; size_t dst_w = 3 * test::Options::vector_lanes() - 1; size_t dst_h = 4; - size_t channels = 1; size_t padding = 1; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, - border_type, border_value, interpolation, - padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, interpolation, + padding); + } } } } @@ -1249,17 +1256,18 @@ TYPED_TEST(RemapF32, NullPointer) { const size_t dst_stride = dst_width * element_size; const TypeParam src[4] = {}; TypeParam dst[1]; - const size_t channels = 1; float mapx[1] = {}; const size_t mapx_stride = dst_width * 
sizeof(float); float mapy[1] = {}; const size_t mapy_stride = dst_width * sizeof(float); const TypeParam border_value[1] = {}; - test::test_null_args(remap_f32(), src, src_stride, src_width, - src_height, dst, dst_stride, dst_width, dst_height, - channels, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_CONSTANT, border_value); + for (size_t channels = 1; channels <= 2; ++channels) { + test::test_null_args(remap_f32(), src, src_stride, src_width, + src_height, dst, dst_stride, dst_width, dst_height, + channels, mapx, mapx_stride, mapy, mapy_stride, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, border_value); + } } TYPED_TEST(RemapF32, ZeroHeightImage) { @@ -1288,7 +1296,7 @@ TYPED_TEST(RemapF32, ZeroHeightImage) { border_type, border_value)); } const TypeParam border_value[1] = {0}; - EXPECT_EQ(KLEIDICV_ERROR_RANGE, + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()( src, src_stride, kW, 0, dst, dst_stride, kW, 1, 1, mapx, mapx_stride, mapy, mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, @@ -1333,13 +1341,13 @@ TYPED_TEST(RemapF32, InvalidImageSize) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -TYPED_TEST(RemapF32, UnsupportedTwoChannels) { +TYPED_TEST(RemapF32, UnsupportedThreeChannels) { const size_t element_size = sizeof(TypeParam); const TypeParam src[1] = {}; TypeParam dst[16]; float mapx[16] = {}; float mapy[16] = {}; - const size_t channels = 2; + const size_t channels = 3; EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, @@ -1388,7 +1396,7 @@ TYPED_TEST(RemapF32, UnsupportedBigSourceWidth) { float mapx[16] = {}; float mapy[16] = {}; - EXPECT_EQ(KLEIDICV_ERROR_RANGE, + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()(src, element_size, 1ULL << 24, 1, dst, 16 * element_size, 16, 1, 1, mapx, 16 * sizeof(float), mapy, 16 * sizeof(float), @@ -1410,7 +1418,7 @@ TYPED_TEST(RemapF32, UnsupportedBigSourceHeight) { float mapx[16] = {}; float mapy[16] = {}; - EXPECT_EQ(KLEIDICV_ERROR_RANGE, + 
EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()(src, element_size, 1, 1ULL << 24, dst, 16 * element_size, 16, 1, 1, mapx, 16 * sizeof(float), mapy, 16 * sizeof(float), @@ -1433,7 +1441,7 @@ TYPED_TEST(RemapF32, UnsupportedBigDestinationWidth) { float mapy[16] = {}; EXPECT_EQ( - KLEIDICV_ERROR_RANGE, + KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, 1ULL << 24, 1, 1, mapx, 16 * sizeof(float), mapy, 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, @@ -1448,7 +1456,7 @@ TYPED_TEST(RemapF32, UnsupportedBigDestinationHeight) { float mapy[16] = {}; EXPECT_EQ( - KLEIDICV_ERROR_RANGE, + KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, 16, 1ULL << 24, 1, mapx, 16 * sizeof(float), mapy, 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 128cf7af7..9a2b9affa 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -766,6 +766,9 @@ TEST_P(Thread, remap_f32_u8_border_replicate) { check_remap_f32(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); + check_remap_f32(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8, + 2, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } TEST_P(Thread, remap_f32_u8_border_constant) { @@ -773,12 +776,18 @@ TEST_P(Thread, remap_f32_u8_border_constant) { check_remap_f32(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); + check_remap_f32(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8, + 2, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); } TEST_P(Thread, remap_f32_u8_not_implemented) { const uint8_t border_value = 0; check_remap_f32_not_implemented( - kleidicv_thread_remap_f32_u8, 2, KLEIDICV_INTERPOLATION_LINEAR, + kleidicv_thread_remap_f32_u8, 3, 
KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u8, 4, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); check_remap_f32_not_implemented( kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_LINEAR, @@ -789,6 +798,9 @@ TEST_P(Thread, remap_f32_u16_border_replicate) { check_remap_f32( kleidicv_remap_f32_u16, kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); + check_remap_f32( + kleidicv_remap_f32_u16, kleidicv_thread_remap_f32_u16, 2, + KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } TEST_P(Thread, remap_f32_u16_border_constant) { @@ -797,12 +809,19 @@ TEST_P(Thread, remap_f32_u16_border_constant) { kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); + check_remap_f32(kleidicv_remap_f32_u16, + kleidicv_thread_remap_f32_u16, 2, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); } TEST_P(Thread, remap_f32_u16_not_implemented) { const uint16_t border_value = 0; check_remap_f32_not_implemented( - kleidicv_thread_remap_f32_u16, 2, KLEIDICV_INTERPOLATION_LINEAR, + kleidicv_thread_remap_f32_u16, 3, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u16, 4, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); check_remap_f32_not_implemented( kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_LINEAR, -- GitLab