From 8aaafef0f9f48381f9df701baa9e5359189c3f58 Mon Sep 17 00:00:00 2001
From: Richard Wells <richard.wells@arm.com>
Date: Tue, 18 Feb 2025 15:06:45 +0000
Subject: [PATCH] Implement 4-channel Remap16Point5 replicate border.

---
 CHANGELOG.md                                  |  13 +-
 adapters/opencv/kleidicv_hal.cpp              |  16 +-
 conformity/opencv/test_remap.cpp              |  31 +-
 doc/functionality.md                          |  14 +-
 doc/opencv.md                                 |   4 +-
 kleidicv/include/kleidicv/kleidicv.h          |   4 +-
 kleidicv/include/kleidicv/transform/remap.h   |   6 +-
 .../src/transform/remap_s16point5_neon.cpp    | 576 +++++++++++++-----
 .../src/transform/remap_s16point5_sve2.cpp    | 386 +++++++++++-
 scripts/benchmark/benchmarks.txt              |   2 +
 test/api/test_remap.cpp                       | 251 +++++---
 test/api/test_thread.cpp                      |  12 +
 12 files changed, 1031 insertions(+), 284 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ffaf5232d..828d90092 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,13 +17,14 @@ This changelog aims to follow the guiding principles of
 
 ### Added
 - Implementation of Rotate 90 degrees clockwise.
-- Remap implementations with
-  - Integer coordinates with nearest neighbour method
+- Remap implementations for u8 and u16 images
+  - Integer coordinates with nearest neighbour method (1 channel only)
+    - Replicated and constant borders
   - Fixed-point coordinates with linear interpolation
-  - Floating-point coordinates with nearest neighbour and linear interpolation
-  - Replicated and constant borders
-  - 1 channel source image for integer and fixed-points coordinates, 1 and 2 channels for floating-point coordinates
-  - u8 and u16 images
+    - 1 channel with Replicated and constant borders
+    - 4 channels with Replicated borders only
+  - Floating-point coordinates with nearest neighbour and linear interpolation (1 and 2 channels)
+    - Replicated and constant borders
 - WarpPerspective implementation
   - Nearest and Linear interpolation method, for 1-channel u8 input.
 
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index 8a897d1e4..8525450a4 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -1354,24 +1354,24 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step,
 
   auto mt = get_multithreading();
 
-  if (src_type == CV_8UC1) {
+  if (CV_MAT_DEPTH(src_type) == CV_8U) {
     auto border_value = get_border_value<uint8_t>(border_value_f64);
     return convert_error(kleidicv_thread_remap_s16point5_u8(
         reinterpret_cast<const uint8_t *>(src_data), src_step,
         static_cast<size_t>(src_width), static_cast<size_t>(src_height),
         reinterpret_cast<uint8_t *>(dst_data), dst_step,
-        static_cast<size_t>(dst_width), static_cast<size_t>(dst_height), 1,
-        mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type,
-        border_value.data(), mt));
-  } else if (src_type == CV_16UC1) {
+        static_cast<size_t>(dst_width), static_cast<size_t>(dst_height),
+        CV_MAT_CN(src_type), mapxy, mapxy_step, mapfrac, mapfrac_step,
+        kleidicv_border_type, border_value.data(), mt));
+  } else if (CV_MAT_DEPTH(src_type) == CV_16U) {
     auto border_value = get_border_value<uint16_t>(border_value_f64);
     return convert_error(kleidicv_thread_remap_s16point5_u16(
         reinterpret_cast<const uint16_t *>(src_data), src_step,
         static_cast<size_t>(src_width), static_cast<size_t>(src_height),
         reinterpret_cast<uint16_t *>(dst_data), dst_step,
-        static_cast<size_t>(dst_width), static_cast<size_t>(dst_height), 1,
-        mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type,
-        border_value.data(), mt));
+        static_cast<size_t>(dst_width), static_cast<size_t>(dst_height),
+        CV_MAT_CN(src_type), mapxy, mapxy_step, mapfrac, mapfrac_step,
+        kleidicv_border_type, border_value.data(), mt));
   }
 
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp
index f80e1ab3d..566caf88f 100644
--- a/conformity/opencv/test_remap.cpp
+++ b/conformity/opencv/test_remap.cpp
@@ -12,8 +12,8 @@
 
 const int kMaxHeight = 36, kMaxWidth = 32;
 
-template <class ScalarType>
-static cv::Mat get_source_mat(int format) {
+template <class ScalarType, int Channels>
+cv::Mat get_source_mat(int format) {
   auto generate_source = [&]() {
     cv::Mat m(kMaxHeight, kMaxWidth, format);
     const int64_t kMaxValue = std::numeric_limits<ScalarType>::max();
@@ -21,9 +21,13 @@ static cv::Mat get_source_mat(int format) {
       for (size_t column = 0; column < kMaxWidth; ++column) {
         // Create as many different differences between neighbouring pixels as
         // possible
-        size_t counter = row + column;
-        m.at<ScalarType>(row, column) =
-            (counter % 2) ? kMaxValue : (counter % (kMaxValue + 1));
+        cv::Vec<ScalarType, Channels> pixel_value;
+        for (size_t ch = 0; ch < Channels; ++ch) {
+          size_t counter = row + column + ch;
+          pixel_value[ch] =
+              (counter % 2) ? kMaxValue : (counter % (kMaxValue + 1));
+        }
+        m.at<cv::Vec<ScalarType, Channels>>(row, column) = pixel_value;
       }
     }
     return m;
@@ -36,7 +40,7 @@ static cv::Mat get_source_mat(int format) {
 template <class ScalarType, int Format, int Interpolation, int BorderMode,
           int BorderValue>
 cv::Mat exec_remap_s16(cv::Mat& mapxy_mat) {
-  cv::Mat source_mat = get_source_mat<ScalarType>(Format);
+  cv::Mat source_mat = get_source_mat<ScalarType, CV_MAT_CN(Format)>(Format);
   cv::Mat result(mapxy_mat.rows, mapxy_mat.cols, Format);
   cv::Mat empty;
   remap(source_mat, result, mapxy_mat, empty, Interpolation, BorderMode,
@@ -49,12 +53,13 @@ template <class ScalarType, int Format, int Interpolation, int BorderMode,
           int BorderValue>
 bool test_remap_s16(int index, RecreatedMessageQueue& request_queue,
                     RecreatedMessageQueue& reply_queue) {
-  cv::Mat source_mat = get_source_mat<ScalarType>(Format);
+  cv::Mat source_mat = get_source_mat<ScalarType, CV_MAT_CN(Format)>(Format);
   cv::RNG rng(0);
 
   for (size_t w = 5; w <= kMaxWidth; w += 3) {
     for (size_t h = 5; h <= kMaxHeight; h += 2) {
-      cv::Mat source_mat = get_source_mat<ScalarType>(Format);
+      cv::Mat source_mat =
+          get_source_mat<ScalarType, CV_MAT_CN(Format)>(Format);
       cv::Mat mapxy_mat(w, h, CV_16SC2);
       rng.fill(mapxy_mat, cv::RNG::UNIFORM, -3, kMaxWidth + 3);
 
@@ -89,7 +94,7 @@ cv::Mat exec_remap_s16point5(cv::Mat& map_mat) {
   ushort* p_frac = map_mat.rowRange(height, map_mat.rows).ptr<ushort>();
   cv::Mat mapfrac_mat{height, map_mat.cols, CV_16UC1, p_frac};
   cv::Mat result(mapxy_mat.rows, mapxy_mat.cols, Format);
-  cv::Mat source_mat = get_source_mat<ScalarType>(Format);
+  cv::Mat source_mat = get_source_mat<ScalarType, CV_MAT_CN(Format)>(Format);
   remap(source_mat, result, mapxy_mat, mapfrac_mat, Interpolation, BorderMode,
         BorderValue / 1000.0);
   return result;
@@ -100,7 +105,7 @@ template <class ScalarType, int Format, int Interpolation, int BorderMode,
           int BorderValue>
 bool test_remap_s16point5(int index, RecreatedMessageQueue& request_queue,
                           RecreatedMessageQueue& reply_queue) {
-  cv::Mat source_mat = get_source_mat<ScalarType>(Format);
+  cv::Mat source_mat = get_source_mat<ScalarType, CV_MAT_CN(Format)>(Format);
   cv::RNG rng(0);
 
   for (int w = 5; w <= kMaxWidth; ++w) {
@@ -138,7 +143,7 @@ bool test_remap_s16point5(int index, RecreatedMessageQueue& request_queue,
 template <class ScalarType, int Format, int Interpolation, int BorderMode,
           int BorderValue>
 cv::Mat exec_remap_f32(cv::Mat& mapxy_mat) {
-  cv::Mat source_mat = get_source_mat<ScalarType>(Format);
+  cv::Mat source_mat = get_source_mat<ScalarType, CV_MAT_CN(Format)>(Format);
   cv::Mat result(mapxy_mat.rows, mapxy_mat.cols, Format);
 
   cv::Mat mapx_mat = mapxy_mat.rowRange(0, mapxy_mat.rows / 2);
@@ -154,7 +159,7 @@ template <class ScalarType, int Format, int Interpolation, int BorderMode,
           int BorderValue>
 bool test_remap_f32(int index, RecreatedMessageQueue& request_queue,
                     RecreatedMessageQueue& reply_queue) {
-  cv::Mat source_mat = get_source_mat<ScalarType>(Format);
+  cv::Mat source_mat = get_source_mat<ScalarType, CV_MAT_CN(Format)>(Format);
   cv::RNG rng(0);
 
   for (size_t w = 5; w <= kMaxWidth * 2; w += 3) {
@@ -250,7 +255,9 @@ std::vector<test>& remap_tests_get() {
     TEST("RemapS16 uint16 Constant", (test_remap_s16<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 12321>), (exec_remap_s16<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 12321>)),
 
     TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
+    TEST("RemapS16Point5 uint8 Replicate 4ch", (test_remap_s16point5<uint8_t, CV_8UC4, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_s16point5<uint8_t, CV_8UC4, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
     TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
+    TEST("RemapS16Point5 uint16 Replicate 4ch", (test_remap_s16point5<uint16_t, CV_16UC4, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_s16point5<uint16_t, CV_16UC4, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
     TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>), (exec_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>)),
     TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>), (exec_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>)),
 
diff --git a/doc/functionality.md b/doc/functionality.md
index 3d178946f..62bc84170 100644
--- a/doc/functionality.md
+++ b/doc/functionality.md
@@ -93,12 +93,14 @@ See `doc/opencv.md` for details of the functionality available in OpenCV.
 | 8x8         |     |  x  |
 
 # Remap
-|                                                   |  1ch u8 | 1ch u16 | 2ch u8 | 2ch u16 |
-|---------------------------------------------------|---------|---------|--------|---------|
-| Remap int16 coordinates                           |    x    |    x    |        |         |
-| Remap int16+uint16 fixed-point coordinates        |    x    |    x    |        |         |
-| Remap float32 coordinates - nearest interpolation |    x    |    x    |    x   |    x    |
-| Remap float32 coordinates - linear interpolation  |    x    |    x    |    x   |    x    |
+|                                                   |  1ch u8 | 1ch u16 | 2ch u8 | 2ch u16 | 4ch u8 | 4ch u16 |
+|---------------------------------------------------|---------|---------|--------|---------|--------|---------|
+| Remap int16 coordinates                           |    x    |    x    |        |         |        |         |
+| Remap int16+uint16 fixed-point coordinates        |    x    |    x    |        |         |   R    |    R    |
+| Remap float32 coordinates - nearest interpolation |    x    |    x    |    x   |    x    |        |         |
+| Remap float32 coordinates - linear interpolation  |    x    |    x    |    x   |    x    |        |         |
+
+R = Replicated borders only
 
 # WarpPerspective
 |                      |  u8 |
diff --git a/doc/opencv.md b/doc/opencv.md
index dfc154e09..f70aab86f 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -208,7 +208,9 @@ Geometrically transforms the `src` image by taking the pixels specified by the c
 Notes on parameters:
 * `src.step` - must be less than 65536 * element size.
 * `src.width`, `src_height` - must not be greater than 32768.
-* `src.type()` - supports `CV_8UC1` and `CV_16UC1`. With `CV_32FC1` map config, it supports `CV_8UC2` and `CV_16UC2` as well.
+* `src.type()` - supports `CV_8UC1` and `CV_16UC1` with all map configs
+  * additionally, with `CV_32FC1` map config, it supports `CV_8UC2` and `CV_16UC2` as well.
+  * additionally, with `CV_16SC2` plus `CV_16UC1` map config and `BORDER_REPLICATE`, it supports `CV_8UC4` and `CV_16UC4`
 * `dst.cols` - must be at least 4 (32FC1-type maps) or 8 (16SC2-type maps)
 * `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`.
 
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index 6a5ff2658..2c4e1dff5 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1786,7 +1786,9 @@ KLEIDICV_API_DECLARATION(kleidicv_in_range_f32, const float *src,
 ///                     start of the next row for the destination data.
 ///                     Must be a multiple of `sizeof(int16_t)` and no less than
 ///                     `width * sizeof(int16_t)`, except for single-row images.
-/// @param channels      Number of channels in the data. Must be 1.
+/// @param channels      Number of channels in the data: \n
+///                         - Must be 1 for constant border.
+//                          - Must be 1 or 4 for replicate border.
 /// @param border_type   Way of handling the border. The supported border types
 ///                      are: \n
 ///                         - @ref KLEIDICV_BORDER_TYPE_CONSTANT
diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h
index ea72f5666..6f807c4b7 100644
--- a/kleidicv/include/kleidicv/transform/remap.h
+++ b/kleidicv/include/kleidicv/transform/remap.h
@@ -38,13 +38,15 @@ inline bool remap_s16point5_is_implemented(
     size_t channels) KLEIDICV_STREAMING_COMPATIBLE {
   if constexpr (std::is_same<T, uint8_t>::value ||
                 std::is_same<T, uint16_t>::value) {
-    return (src_stride / sizeof(T) <= std::numeric_limits<uint16_t>::max() &&
+    return (src_stride / sizeof(T) <=
+                (std::numeric_limits<uint16_t>::max() / channels) &&
             dst_width >= 8 &&
             src_width <= std::numeric_limits<int16_t>::max() + 1 &&
             src_height <= std::numeric_limits<int16_t>::max() + 1 &&
             (border_type == KLEIDICV_BORDER_TYPE_REPLICATE ||
              border_type == KLEIDICV_BORDER_TYPE_CONSTANT) &&
-            channels == 1);
+            (channels == 1 ||
+             (channels == 4 && border_type == KLEIDICV_BORDER_TYPE_REPLICATE)));
   } else {
     return false;
   }
diff --git a/kleidicv/src/transform/remap_s16point5_neon.cpp b/kleidicv/src/transform/remap_s16point5_neon.cpp
index 27893a0a3..32562c519 100644
--- a/kleidicv/src/transform/remap_s16point5_neon.cpp
+++ b/kleidicv/src/transform/remap_s16point5_neon.cpp
@@ -3,9 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include <cassert>
-#include <type_traits>
 
-#include "kleidicv/kleidicv.h"
 #include "kleidicv/neon.h"
 #include "kleidicv/transform/remap.h"
 
@@ -138,6 +136,61 @@ class RemapS16Point5Replicate<uint8_t> {
   int16x8_t v_ymax_;
 };  // end of class RemapS16Point5Replicate<uint8_t>
 
+// Common interpolation function used by all RemapS16Point5 operations except
+// 1-channel u8 with replicated borders (RemapS16Point5Replicate<uint8_t>)
+// because that processes one half vector in one step
+static uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c,
+                              uint16x8_t d, uint16x8_t xfrac, uint16x8_t yfrac,
+                              uint16x8_t nxfrac, uint16x8_t nyfrac) {
+  auto interpolate_horizontal = [](uint16x4_t left, uint16x4_t right,
+                                   uint16x4_t frac,
+                                   uint16x4_t nfrac) -> uint32x4_t {
+    return vmlal_u16(vmull_u16(nfrac, left), frac, right);
+  };
+
+  auto interpolate_horizontal_low = [interpolate_horizontal](
+                                        uint16x8_t left, uint16x8_t right,
+                                        uint16x8_t frac,
+                                        uint16x8_t nfrac) -> uint32x4_t {
+    return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right),
+                                  vget_low_u16(frac), vget_low_u16(nfrac));
+  };
+
+  auto interpolate_horizontal_high = [interpolate_horizontal](
+                                         uint16x8_t left, uint16x8_t right,
+                                         uint16x8_t frac,
+                                         uint16x8_t nfrac) -> uint32x4_t {
+    return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right),
+                                  vget_high_u16(frac), vget_high_u16(nfrac));
+  };
+
+  // Offset pixel values by 0.5 before rounding down.
+  const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
+
+  auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac,
+                                  uint32x4_t nfrac) -> uint32x4_t {
+    uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac);
+    return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS);
+  };
+
+  uint32x4_t line0_low = interpolate_horizontal_low(a, b, xfrac, nxfrac);
+  uint32x4_t line1_low = interpolate_horizontal_low(c, d, xfrac, nxfrac);
+  uint32x4_t line0_high = interpolate_horizontal_high(a, b, xfrac, nxfrac);
+  uint32x4_t line1_high = interpolate_horizontal_high(c, d, xfrac, nxfrac);
+
+  uint32x4_t lo =
+      interpolate_vertical(line0_low, line1_low, vmovl_u16(vget_low_u16(yfrac)),
+                           vmovl_u16(vget_low_u16(nyfrac)));
+  uint32x4_t hi = interpolate_vertical(
+      line0_high, line1_high, vmovl_high_u16(yfrac), vmovl_high_u16(nyfrac));
+
+  // Discard upper 16 bits of each element (low the precision back to original
+  // 16 bits)
+  uint16x8_t result =
+      vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi));
+  return result;
+}
+
 template <>
 class RemapS16Point5Replicate<uint16_t> {
  public:
@@ -207,11 +260,12 @@ class RemapS16Point5Replicate<uint16_t> {
 
   void transform_pixels(Columns<uint16_t> dst) {
     uint16x8_t a = load_pixels(x0_, y0_);
-    uint16x8_t b = load_pixels(x0_, y1_);
-    uint16x8_t c = load_pixels(x1_, y0_);
+    uint16x8_t b = load_pixels(x1_, y0_);
+    uint16x8_t c = load_pixels(x0_, y1_);
     uint16x8_t d = load_pixels(x1_, y1_);
 
-    uint16x8_t result = interpolate(a, b, c, d);
+    uint16x8_t result =
+        interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
 
     vst1q_u16(&dst[0], result);
   }
@@ -245,56 +299,6 @@ class RemapS16Point5Replicate<uint16_t> {
     return pixels;
   }
 
-  uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c,
-                         uint16x8_t d) {
-    auto interpolate_horizontal = [&](uint16x4_t left, uint16x4_t right,
-                                      uint16x4_t frac,
-                                      uint16x4_t nfrac) -> uint32x4_t {
-      return vmlal_u16(vmull_u16(nfrac, left), frac, right);
-    };
-
-    auto interpolate_horizontal_low = [&](uint16x8_t left, uint16x8_t right,
-                                          uint16x8_t frac,
-                                          uint16x8_t nfrac) -> uint32x4_t {
-      return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right),
-                                    vget_low_u16(frac), vget_low_u16(nfrac));
-    };
-
-    auto interpolate_horizontal_high = [&](uint16x8_t left, uint16x8_t right,
-                                           uint16x8_t frac,
-                                           uint16x8_t nfrac) -> uint32x4_t {
-      return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right),
-                                    vget_high_u16(frac), vget_high_u16(nfrac));
-    };
-
-    // Offset pixel values by 0.5 before rounding down.
-    const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
-
-    auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac,
-                                    uint32x4_t nfrac) -> uint32x4_t {
-      uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac);
-      return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS);
-    };
-
-    uint32x4_t line0_low = interpolate_horizontal_low(a, c, xfrac_, nxfrac_);
-    uint32x4_t line1_low = interpolate_horizontal_low(b, d, xfrac_, nxfrac_);
-    uint32x4_t line0_high = interpolate_horizontal_high(a, c, xfrac_, nxfrac_);
-    uint32x4_t line1_high = interpolate_horizontal_high(b, d, xfrac_, nxfrac_);
-
-    uint32x4_t lo = interpolate_vertical(line0_low, line1_low,
-                                         vmovl_u16(vget_low_u16(yfrac_)),
-                                         vmovl_u16(vget_low_u16(nyfrac_)));
-    uint32x4_t hi =
-        interpolate_vertical(line0_high, line1_high, vmovl_high_u16(yfrac_),
-                             vmovl_high_u16(nyfrac_));
-
-    // Discard upper 16 bits of each element (low the precision back to original
-    // 16 bits)
-    uint16x8_t result =
-        vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi));
-    return result;
-  }
-
  private:
   Rows<const ScalarType> src_rows_;
   uint16x8_t v_src_element_stride_;
@@ -326,20 +330,20 @@ class RemapS16Point5ConstantBorder<uint8_t> {
         v_src_stride_{vdupq_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
         v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))},
         v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))},
-        v_border_{vdup_n_u8(*border_value)} {}
+        v_border_{vdupq_n_u16(static_cast<uint16_t>(*border_value))} {}
 
   void process_row(size_t width, Columns<const int16_t> mapxy,
                    Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
     auto vector_path = [&](size_t step) {
       int16x8x2_t xy = vld2q_s16(&mapxy[0]);
       uint16x8_t frac = vld1q_u16(&mapfrac[0]);
-      uint8x8_t frac_max = vdup_n_u8(REMAP16POINT5_FRAC_MAX);
-      uint8x8_t frac_mask = vdup_n_u8(REMAP16POINT5_FRAC_MAX - 1);
-      uint8x8_t xfrac = vand_u8(vmovn_u16(frac), frac_mask);
-      uint8x8_t yfrac =
-          vand_u8(vshrn_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
-      uint8x8_t nxfrac = vsub_u8(frac_max, xfrac);
-      uint8x8_t nyfrac = vsub_u8(frac_max, yfrac);
+      uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX);
+      uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1);
+      uint16x8_t xfrac = vandq_u16(frac, frac_mask);
+      uint16x8_t yfrac =
+          vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask);
+      uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac);
+      uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac);
 
       uint16x8_t one = vdupq_n_u16(1);
       uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]);
@@ -347,19 +351,18 @@ class RemapS16Point5ConstantBorder<uint8_t> {
       uint16x8_t x1 = vaddq_u16(x0, one);
       uint16x8_t y1 = vaddq_u16(y0, one);
 
-      uint8x8_t v00 = load_pixels_or_constant_border(
+      uint16x8_t a = load_pixels_or_constant_border(
           src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y0);
-      uint8x8_t v01 = load_pixels_or_constant_border(
-          src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1);
-      uint8x8_t v10 = load_pixels_or_constant_border(
+      uint16x8_t b = load_pixels_or_constant_border(
           src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y0);
-      uint8x8_t v11 = load_pixels_or_constant_border(
+      uint16x8_t c = load_pixels_or_constant_border(
+          src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1);
+      uint16x8_t d = load_pixels_or_constant_border(
           src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1);
 
-      uint8x8_t result = interpolate(v00, v01, v10, v11, xfrac, vmovl_u8(yfrac),
-                                     nxfrac, vmovl_u8(nyfrac));
+      uint16x8_t result = interpolate(a, b, c, d, xfrac, yfrac, nxfrac, nyfrac);
 
-      vst1_u8(&dst[0], result);
+      vst1_u8(&dst[0], vqmovn_u16(result));
       mapxy += ptrdiff_t(step);
       mapfrac += ptrdiff_t(step);
       dst += ptrdiff_t(step);
@@ -375,12 +378,12 @@ class RemapS16Point5ConstantBorder<uint8_t> {
   }
 
  private:
-  uint8x8_t load_pixels_or_constant_border(Rows<const uint8_t> &src_rows_,
-                                           uint16x8_t v_src_element_stride_,
-                                           uint16x8_t v_width_,
-                                           uint16x8_t v_height_,
-                                           uint8x8_t v_border_, uint16x8_t x,
-                                           uint16x8_t y) {
+  uint16x8_t load_pixels_or_constant_border(Rows<const uint8_t> &src_rows_,
+                                            uint16x8_t v_src_element_stride_,
+                                            uint16x8_t v_width_,
+                                            uint16x8_t v_height_,
+                                            uint16x8_t v_border_, uint16x8_t x,
+                                            uint16x8_t y) {
     // Find whether coordinates are within the image dimensions.
     // Negative coordinates are interpreted as large values due to the s16->u16
     // reinterpretation.
@@ -411,45 +414,14 @@ class RemapS16Point5ConstantBorder<uint8_t> {
         src_rows_[vgetq_lane_u32(indices_high, 3)],
     };
     // Select between source pixels and border colour
-    return vbsl_u8(vmovn_u16(in_range), pixels, v_border_);
-  }
-
-  uint8x8_t interpolate(uint8x8_t v00, uint8x8_t v01, uint8x8_t v10,
-                        uint8x8_t v11, uint8x8_t xfrac, uint16x8_t yfrac,
-                        uint8x8_t nxfrac, uint16x8_t nyfrac) {
-    auto interpolate_horizontal = [&](uint8x8_t left, uint8x8_t right) {
-      return vmlal_u8(vmull_u8(nxfrac, left), xfrac, right);
-    };
-
-    // Offset pixel values from [0,255] to [0.5,255.5] before rounding down.
-    const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
-
-    auto interpolate_vertical = [&](uint16x4_t a, uint16x4_t b, uint16x4_t frac,
-                                    uint16x4_t nfrac) {
-      uint32x4_t res32 = vmlal_u16(vmlal_u16(bias, a, nfrac), b, frac);
-      return vshrn_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS);
-    };
-
-    uint16x8_t line0 = interpolate_horizontal(v00, v10);
-    uint16x8_t line1 = interpolate_horizontal(v01, v11);
-
-    uint16x4_t lo =
-        interpolate_vertical(vget_low_u16(line0), vget_low_u16(line1),
-                             vget_low_u16(yfrac), vget_low_u16(nyfrac));
-    uint16x4_t hi =
-        interpolate_vertical(vget_high_u16(line0), vget_high_u16(line1),
-                             vget_high_u16(yfrac), vget_high_u16(nyfrac));
-
-    // Discard upper 8 bits of each element and combine low and high parts into
-    // a single register.
-    return vuzp1_u8(vreinterpret_u8_u16(lo), vreinterpret_u8_u16(hi));
+    return vbslq_u16(in_range, vmovl_u8(pixels), v_border_);
   }
 
   Rows<const ScalarType> src_rows_;
   uint16x8_t v_src_stride_;
   uint16x8_t v_width_;
   uint16x8_t v_height_;
-  uint8x8_t v_border_;
+  uint16x8_t v_border_;
 };  // end of class RemapS16Point5ConstantBorder<uint8_t>
 
 template <>
@@ -516,11 +488,12 @@ class RemapS16Point5ConstantBorder<uint16_t> {
 
   void transform_pixels(Columns<uint16_t> dst) {
     uint16x8_t a = load_pixels(x0_, y0_);
-    uint16x8_t b = load_pixels(x0_, y1_);
-    uint16x8_t c = load_pixels(x1_, y0_);
+    uint16x8_t b = load_pixels(x1_, y0_);
+    uint16x8_t c = load_pixels(x0_, y1_);
     uint16x8_t d = load_pixels(x1_, y1_);
 
-    uint16x8_t result = interpolate(a, b, c, d);
+    uint16x8_t result =
+        interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_);
 
     vst1q_u16(&dst[0], result);
   }
@@ -559,56 +532,6 @@ class RemapS16Point5ConstantBorder<uint16_t> {
     return vbslq_u16(in_range, pixels, v_border_);
   }
 
-  uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c,
-                         uint16x8_t d) {
-    auto interpolate_horizontal = [&](uint16x4_t left, uint16x4_t right,
-                                      uint16x4_t frac,
-                                      uint16x4_t nfrac) -> uint32x4_t {
-      return vmlal_u16(vmull_u16(nfrac, left), frac, right);
-    };
-
-    auto interpolate_horizontal_low = [&](uint16x8_t left, uint16x8_t right,
-                                          uint16x8_t frac,
-                                          uint16x8_t nfrac) -> uint32x4_t {
-      return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right),
-                                    vget_low_u16(frac), vget_low_u16(nfrac));
-    };
-
-    auto interpolate_horizontal_high = [&](uint16x8_t left, uint16x8_t right,
-                                           uint16x8_t frac,
-                                           uint16x8_t nfrac) -> uint32x4_t {
-      return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right),
-                                    vget_high_u16(frac), vget_high_u16(nfrac));
-    };
-
-    // Offset pixel values by 0.5 before rounding down.
-    const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
-
-    auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac,
-                                    uint32x4_t nfrac) -> uint32x4_t {
-      uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac);
-      return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS);
-    };
-
-    uint32x4_t line0_low = interpolate_horizontal_low(a, c, xfrac_, nxfrac_);
-    uint32x4_t line1_low = interpolate_horizontal_low(b, d, xfrac_, nxfrac_);
-    uint32x4_t line0_high = interpolate_horizontal_high(a, c, xfrac_, nxfrac_);
-    uint32x4_t line1_high = interpolate_horizontal_high(b, d, xfrac_, nxfrac_);
-
-    uint32x4_t lo = interpolate_vertical(line0_low, line1_low,
-                                         vmovl_u16(vget_low_u16(yfrac_)),
-                                         vmovl_u16(vget_low_u16(nyfrac_)));
-    uint32x4_t hi =
-        interpolate_vertical(line0_high, line1_high, vmovl_high_u16(yfrac_),
-                             vmovl_high_u16(nyfrac_));
-
-    // Discard upper 16 bits of each element (low the precision back to original
-    // 16 bits)
-    uint16x8_t result =
-        vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi));
-    return result;
-  }
-
  private:
   Rows<const ScalarType> src_rows_;
   uint16x8_t v_src_element_stride_;
@@ -625,6 +548,312 @@ class RemapS16Point5ConstantBorder<uint16_t> {
   int16x8_t y1_;
 };  // end of class RemapS16Point5ConstantBorder<uint16_t>
 
+inline void get_coordinates(Columns<const int16_t> mapxy,
+                            Columns<const uint16_t> mapfrac, uint16x8_t &x,
+                            uint16x8_t &y, uint16x8_t &xfrac,
+                            uint16x8_t &yfrac) {
+  int16x8x2_t xy = vld2q_s16(&mapxy[0]);
+  x = xy.val[0];
+  y = xy.val[1];
+
+  uint16x8_t frac = vld1q_u16(&mapfrac[0]);
+  xfrac = vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1));
+  yfrac = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS),
+                    vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1));
+}
+
+inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1,
+                            uint16x4_t y1, uint32x4_t &offsets_a,
+                            uint32x4_t &offsets_b, uint32x4_t &offsets_c,
+                            uint32x4_t &offsets_d,
+                            uint16x4_t v_src_element_stride) {
+  // Multiply by 4 because of channels
+  uint32x4_t x0_scaled = vshll_n_u16(x0, 2);
+  uint32x4_t x1_scaled = vshll_n_u16(x1, 2);
+
+  // Calculate offsets from coordinates (y * element_stride + x)
+  // a: top left, b: top right, c: bottom left, d: bottom right
+  offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride);
+  offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride);
+  offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride);
+  offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride);
+}
+
+inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low,
+                                              uint8_t frac_high) {
+  uint8x8_t frac_low_high = {frac_low,  frac_low,  frac_low,  frac_low,
+                             frac_high, frac_high, frac_high, frac_high};
+  return vmovl_u8(frac_low_high);
+}
+
+inline uint64_t load_32bit(const uint8_t *src) {
+  uint32_t value = 0;
+  memcpy(&value, src, sizeof(uint32_t));
+  return static_cast<uint64_t>(value);
+}
+
+inline uint8x16_t load_4px_4ch(Rows<const uint8_t> src_rows,
+                               uint32x4_t offsets) {
+  uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) |
+                      (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32);
+  uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) |
+                      (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32);
+  return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23));
+}
+
+inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns<uint8_t> dst) {
+  vst1q_u8_x2(&dst[0], res);
+}
+
+inline uint16x8_t load_2px_4ch(Rows<const uint16_t> src_rows,
+                               uint32x2_t offsets) {
+  return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]),
+                  vld1_u16(&src_rows[vget_lane_u32(offsets, 1)]));
+}
+
+inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns<uint16_t> dst) {
+  vst1q_u16_x4(&dst[0], res);
+}
+
+// Replicate border specific functions
+inline void get_coordinates_replicate(Columns<const int16_t> mapxy,
+                                      Columns<const uint16_t> mapfrac,
+                                      uint16x8_t &x0, uint16x8_t &y0,
+                                      uint16x8_t &x1, uint16x8_t &y1,
+                                      uint16x8_t &xfrac, uint16x8_t &yfrac,
+                                      int16x8_t v_xmax, int16x8_t v_ymax) {
+  get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac);
+
+  // Zero the xfrac (or yfrac) if x (or y) are below zero
+  xfrac = vbslq_u16(vcltq_s16(x0, vdupq_n_s16(0)), vdupq_n_u16(0), xfrac);
+  yfrac = vbslq_u16(vcltq_s16(y0, vdupq_n_s16(0)), vdupq_n_u16(0), yfrac);
+
+  // Clamp coordinates to within the dimensions of the source image
+  x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax)));
+  y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax)));
+
+  // x1 = x0 + 1, except if it's already xmax
+  x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax));
+  y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax));
+}
+
+inline void load_pixels_u8_4ch_replicate(
+    Rows<const uint8_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
+    uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b,
+    uint8x16_t &c, uint8x16_t &d) {
+  a = load_4px_4ch(src_rows, offsets_a);
+  b = load_4px_4ch(src_rows, offsets_b);
+  c = load_4px_4ch(src_rows, offsets_c);
+  d = load_4px_4ch(src_rows, offsets_d);
+}
+
+inline void load_pixels_u16_4ch_replicate(
+    Rows<const uint16_t> src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b,
+    uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo,
+    uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo,
+    uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) {
+  a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a));
+  b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b));
+  c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c));
+  d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d));
+
+  a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a));
+  b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b));
+  c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c));
+  d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d));
+}
+
+template <typename ScalarType>
+class RemapS16Point5Replicate4ch;
+
+template <>
+class RemapS16Point5Replicate4ch<uint8_t> {
+ public:
+  using ScalarType = uint8_t;
+  using MapVecTraits = neon::VecTraits<int16_t>;
+
+  RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
+                             size_t src_height)
+      : src_rows_{src_rows},
+        v_src_stride_{vdup_n_u16(static_cast<uint16_t>(src_rows_.stride()))},
+        v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
+        v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
+
+  void process_row(size_t width, Columns<const int16_t> mapxy,
+                   Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
+    auto vector_path = [&](size_t step) {
+      uint16x8_t x0, y0, x1, y1;
+      uint16x8_t xfrac, yfrac;
+      get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
+                                v_xmax_, v_ymax_);
+
+      uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
+      uint8x16_t a, b, c, d;
+      uint8x16x2_t res;
+
+      get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
+                      vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
+                      offsets_d, v_src_stride_);
+      load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
+                                   offsets_d, a, b, c, d);
+
+      // Doubled fractions 001122..., low part
+      uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
+      uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
+      uint16x8_t nxfrac2 =
+          vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
+      uint16x8_t nyfrac2 =
+          vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
+      // Quadrupled fractions (00001111) are passed to interpolate
+      uint16x8_t res0 = interpolate(
+          vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)),
+          vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
+          vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
+      uint16x8_t res1 = interpolate(
+          vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
+          vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2),
+          vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2));
+      res.val[0] =
+          vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
+
+      get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
+                      vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
+                      offsets_d, v_src_stride_);
+      load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
+                                   offsets_d, a, b, c, d);
+      // Doubled fractions 001122..., high part
+      xfrac2 = vzip2q(xfrac, xfrac);
+      yfrac2 = vzip2q(yfrac, yfrac);
+      nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
+      nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
+      // Quadrupled fractions (00001111) are passed to interpolate
+      res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)),
+                         vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)),
+                         vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2),
+                         vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2));
+      res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c),
+                         vmovl_high_u8(d), vzip2q(xfrac2, xfrac2),
+                         vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
+                         vzip2q(nyfrac2, nyfrac2));
+      res.val[1] =
+          vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1));
+
+      store_pixels_u8_4ch(res, dst);
+      mapxy += ptrdiff_t(step);
+      mapfrac += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    };
+
+    LoopUnroll loop{width, MapVecTraits::num_lanes()};
+    loop.unroll_once(vector_path);
+    ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
+                          static_cast<ptrdiff_t>(loop.remaining_length());
+    mapxy -= back_step;
+    mapfrac -= back_step;
+    dst -= back_step;
+    loop.remaining([&](size_t, size_t step) { vector_path(step); });
+  }
+
+ private:
+  Rows<const ScalarType> src_rows_;
+  uint16x4_t v_src_stride_;
+  int16x8_t v_xmax_;
+  int16x8_t v_ymax_;
+};  // end of class RemapS16Point5Replicate4ch<uint8_t>
+
+template <>
+class RemapS16Point5Replicate4ch<uint16_t> {
+ public:
+  using ScalarType = uint16_t;
+  using MapVecTraits = neon::VecTraits<int16_t>;
+
+  RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
+                             size_t src_height)
+      : src_rows_{src_rows},
+        v_src_element_stride_{vdup_n_u16(
+            static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))},
+        v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))},
+        v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {}
+
+  void process_row(size_t width, Columns<const int16_t> mapxy,
+                   Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
+    auto vector_path = [&](size_t step) {
+      uint16x8_t x0, y0, x1, y1;
+      uint16x8_t xfrac, yfrac;
+      get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac,
+                                v_xmax_, v_ymax_);
+
+      uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d;
+      uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high;
+      uint16x8x4_t res;
+      get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1),
+                      vget_low_u16(y1), offsets_a, offsets_b, offsets_c,
+                      offsets_d, v_src_element_stride_);
+      load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
+                                    offsets_d, a_low, a_high, b_low, b_high,
+                                    c_low, c_high, d_low, d_high);
+
+      // Doubled fractions 001122..., low part
+      uint16x8_t xfrac2 = vzip1q(xfrac, xfrac);
+      uint16x8_t yfrac2 = vzip1q(yfrac, yfrac);
+      uint16x8_t nxfrac2 =
+          vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
+      uint16x8_t nyfrac2 =
+          vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
+      // Quadrupled fractions (00001111) are passed to interpolate
+      res.val[0] =
+          interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
+                      vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
+                      vzip1q(nyfrac2, nyfrac2));
+      res.val[1] =
+          interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
+                      vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
+                      vzip2q(nyfrac2, nyfrac2));
+
+      get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1),
+                      vget_high_u16(y1), offsets_a, offsets_b, offsets_c,
+                      offsets_d, v_src_element_stride_);
+      load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c,
+                                    offsets_d, a_low, a_high, b_low, b_high,
+                                    c_low, c_high, d_low, d_high);
+      // Doubled fractions 001122..., high part
+      xfrac2 = vzip2q(xfrac, xfrac);
+      yfrac2 = vzip2q(yfrac, yfrac);
+      nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2);
+      nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2);
+      // Quadrupled fractions (00001111) are passed to interpolate
+      res.val[2] =
+          interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2),
+                      vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2),
+                      vzip1q(nyfrac2, nyfrac2));
+      res.val[3] =
+          interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2),
+                      vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2),
+                      vzip2q(nyfrac2, nyfrac2));
+
+      store_pixels_u16_4ch(res, dst);
+      mapxy += ptrdiff_t(step);
+      mapfrac += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    };
+
+    LoopUnroll loop{width, MapVecTraits::num_lanes()};
+    loop.unroll_once(vector_path);
+    ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
+                          static_cast<ptrdiff_t>(loop.remaining_length());
+    mapxy -= back_step;
+    mapfrac -= back_step;
+    dst -= back_step;
+    loop.remaining([&](size_t, size_t step) { vector_path(step); });
+  }
+
+ private:
+  Rows<const ScalarType> src_rows_;
+  uint16x4_t v_src_element_stride_;
+  int16x8_t v_xmax_;
+  int16x8_t v_ymax_;
+};  // end of class RemapS16Point5Replicate4ch<uint16_t>
+
 // Most of the complexity comes from parameter checking.
 // NOLINTBEGIN(readability-function-cognitive-complexity)
 template <typename T>
@@ -656,13 +885,24 @@ kleidicv_error_t remap_s16point5(
   Rows<T> dst_rows{dst, dst_stride, channels};
   Rectangle rect{dst_width, dst_height};
   if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
-    RemapS16Point5ConstantBorder<T> operation{src_rows, src_width, src_height,
-                                              border_value};
-    zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+    if (channels == 1) {
+      RemapS16Point5ConstantBorder<T> operation{src_rows, src_width, src_height,
+                                                border_value};
+      zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+    } else {
+      assert(channels == 4);
+      return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+    }
   } else {
     assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
-    RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height};
-    zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+    if (channels == 1) {
+      RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height};
+      zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+    } else {
+      assert(channels == 4);
+      RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height};
+      zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+    }
   }
   return KLEIDICV_OK;
 }
diff --git a/kleidicv/src/transform/remap_s16point5_sve2.cpp b/kleidicv/src/transform/remap_s16point5_sve2.cpp
index 489d37386..982717173 100644
--- a/kleidicv/src/transform/remap_s16point5_sve2.cpp
+++ b/kleidicv/src/transform/remap_s16point5_sve2.cpp
@@ -519,6 +519,357 @@ class RemapS16Point5ConstantBorder<uint16_t> {
   svuint16_t& v_border_;
 };  // end of class RemapS16Point5ConstantBorder<uint16_t>
 
+template <typename ScalarType>
+class RemapS16Point5Replicate4ch;
+
+template <>
+class RemapS16Point5Replicate4ch<uint8_t> {
+ public:
+  using ScalarType = uint8_t;
+  using MapVecTraits = VecTraits<int16_t>;
+  using MapVectorType = typename MapVecTraits::VectorType;
+  using MapVector2Type = typename MapVecTraits::Vector2Type;
+  using FracVecTraits = VecTraits<uint16_t>;
+  using FracVectorType = typename FracVecTraits::VectorType;
+
+  RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
+                             size_t src_height, svuint16_t& v_src_stride,
+                             MapVectorType& v_x_max, MapVectorType& v_y_max)
+      : src_rows_{src_rows},
+        v_src_stride_{v_src_stride},
+        v_xmax_{v_x_max},
+        v_ymax_{v_y_max} {
+    v_src_stride_ = svdup_u16(src_rows.stride());
+    v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1));
+    v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1));
+  }
+
+  void process_row(size_t width, Columns<const int16_t> mapxy,
+                   Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
+    LoopUnroll loop{width, MapVecTraits::num_lanes()};
+    loop.unroll_once([&](size_t step) {
+      svbool_t pg = MapVecTraits::svptrue();
+      vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(step));
+    });
+    loop.remaining([&](size_t length, size_t step) {
+      svbool_t pg = MapVecTraits::svwhilelt(step - length, step);
+      vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(length));
+    });
+  }
+
+  void vector_path(svbool_t pg, Columns<const int16_t>& mapxy,
+                   Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst,
+                   ptrdiff_t step) {
+    MapVector2Type xy = svld2_s16(pg, &mapxy[0]);
+    svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
+
+    // Clamp coordinates to within the dimensions of the source image
+    svuint16_t x0 = svreinterpret_u16_s16(
+        svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_)));
+    svuint16_t y0 = svreinterpret_u16_s16(
+        svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_)));
+
+    // x1 = x0 + 1, and clamp it too
+    svuint16_t x1 = svreinterpret_u16_s16(
+        svmax_x(pg, svdup_n_s16(0),
+                svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_)));
+
+    svuint16_t y1 = svreinterpret_u16_s16(
+        svmax_x(pg, svdup_n_s16(0),
+                svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_)));
+    svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2);
+    svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2);
+
+    // Calculate offsets from coordinates (y * stride + x), x multiplied by 4
+    // channels
+    auto load_4ch_b = [&](svuint16_t x, svuint16_t y) {
+      return svreinterpret_u8_u32(svld1_gather_u32offset_u32(
+          pg_b, reinterpret_cast<const uint32_t*>(&src_rows_[0]),
+          svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_)));
+    };
+    auto load_4ch_t = [&](svuint16_t x, svuint16_t y) {
+      return svreinterpret_u8_u32(svld1_gather_u32offset_u32(
+          pg_t, reinterpret_cast<const uint32_t*>(&src_rows_[0]),
+          svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_)));
+    };
+
+    FracVectorType frac = svld1_u16(pg, &mapfrac[0]);
+    svuint16_t xfrac =
+        svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
+    svuint16_t yfrac =
+        svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS),
+                svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
+
+    auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac,
+                      svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b,
+                      svuint16_t src_c, svuint16_t src_d, svuint32_t bias) {
+      svuint16_t line0 = svmla_x(
+          svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a);
+      svuint16_t line1 = svmla_x(
+          svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c);
+
+      svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac);
+      svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac);
+      acc_b = svmlalb_u32(acc_b, line1, yfrac);
+      acc_t = svmlalt_u32(acc_t, line1, yfrac);
+
+      return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t,
+                     2ULL * REMAP16POINT5_FRAC_BITS);
+    };
+
+    // bottom part
+    svuint8_t a = load_4ch_b(x0, y0);
+    svuint8_t b = load_4ch_b(x1, y0);
+    svuint8_t c = load_4ch_b(x0, y1);
+    svuint8_t d = load_4ch_b(x1, y1);
+    // from xfrac, we need the bottom part twice
+    svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac);
+    svuint16_t nxfrac2b = svsub_u16_x(
+        svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b);
+    svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac);
+    svuint16_t nyfrac2b = svsub_u16_x(
+        svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b);
+
+    // a,b,c,d looks like 12341234...(four channels)
+    // bottom is 1313...
+    svuint16_t res_bb =
+        lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a),
+               svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias);
+    // top is 2424...
+    svuint16_t res_bt =
+        lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a),
+               svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias);
+    svuint8_t res_b =
+        svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt));
+
+    // top part
+    a = load_4ch_t(x0, y0);
+    b = load_4ch_t(x1, y0);
+    c = load_4ch_t(x0, y1);
+    d = load_4ch_t(x1, y1);
+    // from xfrac, we need the top part twice
+    svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac);
+    svuint16_t nxfrac2t = svsub_u16_x(
+        svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t);
+    svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac);
+    svuint16_t nyfrac2t = svsub_u16_x(
+        svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t);
+
+    // a,b,c,d looks like 12341234...(four channels)
+    // bottom is 1313...
+    svuint16_t res_tb =
+        lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a),
+               svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias);
+    // top is 2424...
+    svuint16_t res_tt =
+        lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a),
+               svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias);
+    svuint8_t res_t =
+        svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt));
+
+    svbool_t pg_low = svwhilelt_b32(0L, step);
+    svbool_t pg_high = svwhilelt_b32(svcntw(), static_cast<size_t>(step));
+    svuint32_t res_low =
+        svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t));
+    svuint32_t res_high =
+        svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t));
+    mapxy += step;
+    svst1_u32(pg_low, reinterpret_cast<uint32_t*>(&dst[0]), res_low);
+    svst1_u32(pg_high, reinterpret_cast<uint32_t*>(&dst[0]) + svcntw(),
+              res_high);
+    mapfrac += step;
+    dst += step;
+  }
+
+  Rows<const ScalarType> src_rows_;
+
+ private:
+  svuint16_t& v_src_stride_;
+  MapVectorType& v_xmax_;
+  MapVectorType& v_ymax_;
+};  // end of class RemapS16Point5Replicate4ch<uint8_t>
+
+template <>
+class RemapS16Point5Replicate4ch<uint16_t> {
+ public:
+  using ScalarType = uint16_t;
+
+  RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width,
+                             size_t src_height, svuint32_t& v_src_stride,
+                             svint32_t& v_x_max, svint32_t& v_y_max)
+      : src_rows_{src_rows},
+        v_src_stride_{v_src_stride},
+        v_xmax_{v_x_max},
+        v_ymax_{v_y_max} {
+    v_src_stride_ = svdup_u32(src_rows.stride());
+    v_xmax_ = svdup_s32(static_cast<int32_t>(src_width - 1));
+    v_ymax_ = svdup_s32(static_cast<int32_t>(src_height - 1));
+  }
+
+  void process_row(size_t width, Columns<const int16_t> mapxy,
+                   Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) {
+    LoopUnroll loop{width, svcntw()};
+    loop.unroll_once([&](size_t step) {
+      vector_path(svptrue_b32(), svptrue_b64(), svptrue_b64(), svptrue_b64(),
+                  svptrue_b64(), mapxy, mapfrac, dst,
+                  static_cast<ptrdiff_t>(step));
+    });
+    loop.remaining([&](size_t length, size_t step) {
+      svbool_t pg = svwhilelt_b32(step, step + length);
+      svbool_t pg64_b = svtrn1_b32(pg, svpfalse());
+      svbool_t pg64_t = svtrn2_b32(pg, svpfalse());
+      svbool_t pg_low = svzip1_b32(pg, svpfalse());
+      svbool_t pg_high = svzip2_b32(pg, svpfalse());
+      vector_path(pg, pg64_b, pg64_t, pg_low, pg_high, mapxy, mapfrac, dst,
+                  static_cast<ptrdiff_t>(length));
+    });
+  }
+
+  void vector_path(svbool_t pg, svbool_t pg64_b, svbool_t pg64_t,
+                   svbool_t pg_low, svbool_t pg_high,
+                   Columns<const int16_t>& mapxy,
+                   Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst,
+                   ptrdiff_t step) {
+    // Load one vector of xy: even coordinates are x, odd are y
+    svint16_t xy = svreinterpret_s16_u32(
+        svld1_u32(pg, reinterpret_cast<const uint32_t*>(&mapxy[0])));
+    svint32_t x = svmovlb(xy);
+    svint32_t y = svmovlt(xy);
+    // Clamp coordinates to within the dimensions of the source image
+    svuint32_t x0 = svreinterpret_u32_s32(
+        svmax_x(pg, svdup_n_s32(0), svmin_x(pg, x, v_xmax_)));
+    svuint32_t y0 = svreinterpret_u32_s32(
+        svmax_x(pg, svdup_n_s32(0), svmin_x(pg, y, v_ymax_)));
+
+    // x1 = x0 + 1, and clamp it too
+    svuint32_t x1 = svreinterpret_u32_s32(svmax_x(
+        pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, x, 1), v_xmax_)));
+    svuint32_t y1 = svreinterpret_u32_s32(svmax_x(
+        pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, y, 1), v_ymax_)));
+
+    auto load_4ch = [&](svbool_t pg, svuint64_t offsets) {
+      return svreinterpret_u16_u64(svld1_gather_u64offset_u64(
+          pg, reinterpret_cast<const uint64_t*>(&src_rows_[0]), offsets));
+    };
+
+    svuint16_t xfrac, yfrac, nxfrac, nyfrac;
+    {
+      // Fractions are loaded into even lanes
+      svuint16_t rawfrac = svreinterpret_u16_u32(svld1uh_u32(pg, &mapfrac[0]));
+
+      // Fractions are doubled, 00112233... (will be doubled again later)
+      svuint16_t frac = svtrn1(rawfrac, rawfrac);
+
+      xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
+      yfrac = svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS),
+                      svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1));
+      nxfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
+      nyfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac);
+    }
+
+    svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2);
+
+    auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac,
+                      svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b,
+                      svuint16_t src_c, svuint16_t src_d, svuint32_t bias) {
+      svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a);
+      svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a);
+      svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c);
+      svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c);
+
+      svuint32_t acc_b =
+          svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac));
+      svuint32_t acc_t =
+          svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac));
+      acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac));
+      acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac));
+
+      return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t,
+                     2ULL * REMAP16POINT5_FRAC_BITS);
+    };
+
+    // Data is 4x16 = 64 bits, twice as wide as the widened coords (32-bit)
+    // Calculation is done in 2 parts, top and bottom
+    svuint16_t res_b, res_t;
+
+    {  // bottom
+      svuint64_t x0w = svshllb_n_u64(x0, 3);
+      svuint64_t x1w = svshllb_n_u64(x1, 3);
+      svuint64_t ys0w = svmullb_u64(y0, v_src_stride_);
+      svuint64_t ys1w = svmullb_u64(y1, v_src_stride_);
+      svuint64_t offsets_a = svadd_x(pg64_b, x0w, ys0w);
+      svuint64_t offsets_b = svadd_x(pg64_b, x1w, ys0w);
+      svuint64_t offsets_c = svadd_x(pg64_b, x0w, ys1w);
+      svuint64_t offsets_d = svadd_x(pg64_b, x1w, ys1w);
+
+      svuint16_t a = load_4ch(pg64_b, offsets_a);
+      svuint16_t b = load_4ch(pg64_b, offsets_b);
+      svuint16_t c = load_4ch(pg64_b, offsets_c);
+      svuint16_t d = load_4ch(pg64_b, offsets_d);
+
+      // Copy even lanes twice -> 000022224444... these are the "bottom"
+      // fractions
+      svuint16_t xfr = svreinterpret_u16_u32(svtrn1_u32(
+          svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac)));
+      svuint16_t nxfr = svreinterpret_u16_u32(svtrn1_u32(
+          svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac)));
+      svuint16_t yfr = svreinterpret_u16_u32(svtrn1_u32(
+          svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac)));
+      svuint16_t nyfr = svreinterpret_u16_u32(svtrn1_u32(
+          svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac)));
+
+      res_b = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias);
+    }
+
+    {  // top
+      svuint64_t x0w = svshllt_n_u64(x0, 3);
+      svuint64_t x1w = svshllt_n_u64(x1, 3);
+      svuint64_t ys0w = svmullt_u64(y0, v_src_stride_);
+      svuint64_t ys1w = svmullt_u64(y1, v_src_stride_);
+      svuint64_t offsets_a = svadd_x(pg64_b, x0w, ys0w);
+      svuint64_t offsets_b = svadd_x(pg64_b, x1w, ys0w);
+      svuint64_t offsets_c = svadd_x(pg64_b, x0w, ys1w);
+      svuint64_t offsets_d = svadd_x(pg64_b, x1w, ys1w);
+
+      svuint16_t a = load_4ch(pg64_t, offsets_a);
+      svuint16_t b = load_4ch(pg64_t, offsets_b);
+      svuint16_t c = load_4ch(pg64_t, offsets_c);
+      svuint16_t d = load_4ch(pg64_t, offsets_d);
+
+      // Copy odd lanes twice -> 111133335555... these are the "top"
+      // fractions
+      svuint16_t xfr = svreinterpret_u16_u32(svtrn2_u32(
+          svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac)));
+      svuint16_t nxfr = svreinterpret_u16_u32(svtrn2_u32(
+          svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac)));
+      svuint16_t yfr = svreinterpret_u16_u32(svtrn2_u32(
+          svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac)));
+      svuint16_t nyfr = svreinterpret_u16_u32(svtrn2_u32(
+          svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac)));
+
+      res_t = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias);
+    }
+
+    svuint64_t res_low =
+        svzip1_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t));
+    svuint64_t res_high =
+        svzip2_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t));
+    svst1_u64(pg_low, reinterpret_cast<uint64_t*>(&dst[0]), res_low);
+    svst1_u64(pg_high, reinterpret_cast<uint64_t*>(&dst[0]) + svcntd(),
+              res_high);
+    mapxy += step;
+    mapfrac += step;
+    dst += step;
+  }
+
+  Rows<const ScalarType> src_rows_;
+
+ private:
+  svuint32_t& v_src_stride_;
+  svint32_t& v_xmax_;
+  svint32_t& v_ymax_;
+};  // end of class RemapS16Point5Replicate4ch<uint16_t>
+
 // Most of the complexity comes from parameter checking.
 // NOLINTBEGIN(readability-function-cognitive-complexity)
 template <typename T>
@@ -554,16 +905,37 @@ kleidicv_error_t remap_s16point5(const T* src, size_t src_stride,
 
   if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
     svuint16_t sv_width, sv_height, sv_border;
-    RemapS16Point5ConstantBorder<T> operation{
-        src_rows,      src_width, src_height, border_value,
-        sv_src_stride, sv_width,  sv_height,  sv_border};
-    zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+    if (channels == 1) {
+      RemapS16Point5ConstantBorder<T> operation{
+          src_rows,      src_width, src_height, border_value,
+          sv_src_stride, sv_width,  sv_height,  sv_border};
+      zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+    } else {
+      assert(channels == 4);
+      return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+    }
   } else {
     assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
     svint16_t sv_xmax, sv_ymax;
-    RemapS16Point5Replicate<T> operation{src_rows,      src_width, src_height,
-                                         sv_src_stride, sv_xmax,   sv_ymax};
-    zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+    if (channels == 1) {
+      RemapS16Point5Replicate<T> operation{src_rows,      src_width, src_height,
+                                           sv_src_stride, sv_xmax,   sv_ymax};
+      zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+    } else {
+      assert(channels == 4);
+      if constexpr (std::is_same<T, uint8_t>::value) {
+        RemapS16Point5Replicate4ch<T> operation{
+            src_rows, src_width, src_height, sv_src_stride, sv_xmax, sv_ymax};
+        zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+      }
+      if constexpr (std::is_same<T, uint16_t>::value) {
+        svuint32_t stride;
+        svint32_t xmax, ymax;
+        RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height,
+                                                stride,   xmax,      ymax};
+        zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows);
+      }
+    }
   }
   return KLEIDICV_OK;
 }
diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt
index ca82aeda0..c293ef1cb 100755
--- a/scripts/benchmark/benchmarks.txt
+++ b/scripts/benchmark/benchmarks.txt
@@ -83,8 +83,10 @@ Remap_S16_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16S
 Remap_S16_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)'
 Remap_S16_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)'
 Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)'
+Remap_S16Point5_U8_Replicate_4ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC4, 16SC2, INTER_LINEAR, BORDER_REPLICATE)'
 Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)'
 Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)'
+Remap_S16Point5_U16_Replicate_4ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC4, 16SC2, INTER_LINEAR, BORDER_REPLICATE)'
 Remap_S16Point5_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)'
 Remap_F32_U8_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)'
 Remap_F32_U8_Constant_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_CONSTANT)'
diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp
index bea82b90b..e85b611bc 100644
--- a/test/api/test_remap.cpp
+++ b/test/api/test_remap.cpp
@@ -36,14 +36,17 @@ template <typename ScalarType>
 static const ScalarType *get_array2d_element_or_border(
     const test::Array2D<ScalarType> &src, ptrdiff_t x, ptrdiff_t y,
     kleidicv_border_type_t border_type, const ScalarType *border_value) {
+  // Width is the number of pixels in a row, but Array2D does not handle that
+  const ptrdiff_t src_width =
+      static_cast<ptrdiff_t>(src.width() / src.channels());
+  const ptrdiff_t src_height = static_cast<ptrdiff_t>(src.height());
+
   if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) {
-    x = std::clamp<ptrdiff_t>(
-        x, 0, static_cast<ptrdiff_t>(src.width() / src.channels()) - 1);
-    y = std::clamp<ptrdiff_t>(y, 0, static_cast<ptrdiff_t>(src.height()) - 1);
+    x = std::clamp<ptrdiff_t>(x, 0, src_width - 1);
+    y = std::clamp<ptrdiff_t>(y, 0, src_height - 1);
   } else {
     assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT);
-    if (x * src.channels() >= src.width() ||
-        y >= static_cast<ptrdiff_t>(src.height()) || x < 0 || y < 0) {
+    if (x >= src_width || y >= src_height || x < 0 || y < 0) {
       return border_value;
     }
   }
@@ -147,12 +150,11 @@ class RemapS16 : public testing::Test {
 
     calculate_expected(source, mapxy, border_type, border_value, expected);
 
-    ASSERT_EQ(
-        KLEIDICV_OK,
-        remap_s16<ScalarType>()(
-            source.data(), source.stride(), source.width(), source.height(),
-            actual.data(), actual.stride(), actual.width(), actual.height(),
-            channels, mapxy.data(), mapxy.stride(), border_type, border_value));
+    ASSERT_EQ(KLEIDICV_OK, remap_s16<ScalarType>()(
+                               source.data(), source.stride(), src_w,
+                               source.height(), actual.data(), actual.stride(),
+                               dst_w, actual.height(), channels, mapxy.data(),
+                               mapxy.stride(), border_type, border_value));
 
     EXPECT_EQ_ARRAY2D(actual, expected);
   }
@@ -176,12 +178,11 @@ class RemapS16 : public testing::Test {
 
     calculate_expected(source, mapxy, border_type, border_value, expected);
 
-    ASSERT_EQ(
-        KLEIDICV_OK,
-        remap_s16<ScalarType>()(
-            source.data(), source.stride(), source.width(), source.height(),
-            actual.data(), actual.stride(), actual.width(), actual.height(),
-            channels, mapxy.data(), mapxy.stride(), border_type, border_value));
+    ASSERT_EQ(KLEIDICV_OK, remap_s16<ScalarType>()(
+                               source.data(), source.stride(), src_w,
+                               source.height(), actual.data(), actual.stride(),
+                               dst_w, actual.height(), channels, mapxy.data(),
+                               mapxy.stride(), border_type, border_value));
 
     EXPECT_EQ_ARRAY2D(actual, expected);
   }
@@ -524,8 +525,9 @@ class RemapS16Point5 : public testing::Test {
       }
     }
 
-    // This part is the same as execute_test() but without initializing source.
-    // Corner Cases use the biggest possible source.
+    // This part is the same as execute_test() except source initialization.
+    // Corner Cases use the biggest possible source, so it is only initializing
+    // the edges.
     size_t src_total_width = channels * src_w;
     size_t dst_total_width = channels * dst_w;
 
@@ -559,12 +561,27 @@ class RemapS16Point5 : public testing::Test {
     calculate_expected(source, mapxy, mapfrac, border_type, border_value,
                        expected);
 
-    ASSERT_EQ(KLEIDICV_OK, remap_s16point5<ScalarType>()(
-                               source.data(), source.stride(), source.width(),
-                               source.height(), actual.data(), actual.stride(),
-                               actual.width(), actual.height(), channels,
-                               mapxy.data(), mapxy.stride(), mapfrac.data(),
-                               mapfrac.stride(), border_type, border_value));
+    ASSERT_EQ(KLEIDICV_OK,
+              remap_s16point5<ScalarType>()(
+                  source.data(), source.stride(), src_w, source.height(),
+                  actual.data(), actual.stride(), dst_w, actual.height(),
+                  channels, mapxy.data(), mapxy.stride(), mapfrac.data(),
+                  mapfrac.stride(), border_type, border_value));
+
+    if (expected.compare_to(actual, 1)) {
+      if (source.width() < 100 && source.height() < 100) {
+        std::cout << "source:\n";
+        dump(&source);
+      }
+      std::cout << "mapxy:\n";
+      dump(&mapxy);
+      std::cout << "mapfrac:\n";
+      dump(&mapfrac);
+      std::cout << "expected:\n";
+      dump(&expected);
+      std::cout << "actual:\n";
+      dump(&actual);
+    }
 
     EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected);
   }
@@ -589,12 +606,27 @@ class RemapS16Point5 : public testing::Test {
     calculate_expected(source, mapxy, mapfrac, border_type, border_value,
                        expected);
 
-    ASSERT_EQ(KLEIDICV_OK, remap_s16point5<ScalarType>()(
-                               source.data(), source.stride(), source.width(),
-                               source.height(), actual.data(), actual.stride(),
-                               actual.width(), actual.height(), channels,
-                               mapxy.data(), mapxy.stride(), mapfrac.data(),
-                               mapfrac.stride(), border_type, border_value));
+    ASSERT_EQ(KLEIDICV_OK,
+              remap_s16point5<ScalarType>()(
+                  source.data(), source.stride(), src_w, source.height(),
+                  actual.data(), actual.stride(), dst_w, actual.height(),
+                  channels, mapxy.data(), mapxy.stride(), mapfrac.data(),
+                  mapfrac.stride(), border_type, border_value));
+
+    if (expected.compare_to(actual)) {
+      if (source.width() < 100 && source.height() < 100) {
+        std::cout << "source:\n";
+        dump(&source);
+      }
+      std::cout << "mapxy:\n";
+      dump(&mapxy);
+      std::cout << "mapfrac:\n";
+      dump(&mapfrac);
+      std::cout << "expected:\n";
+      dump(&expected);
+      std::cout << "actual:\n";
+      dump(&actual);
+    }
 
     EXPECT_EQ_ARRAY2D(actual, expected);
   }
@@ -623,17 +655,17 @@ class RemapS16Point5 : public testing::Test {
     for (size_t row = 0; row < expected.height(); row++) {
       for (size_t column = 0; column < expected.width() / src.channels();
            ++column) {
+        // Clang-tidy thinks mapfrac may contain garbage, but it is fully
+        // initialized at all code paths and the map size always equals dst
+        // map pixel size
+        // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult)
+        uint8_t x_frac = *mapfrac.at(row, column) & (FRAC_MAX - 1);
+        uint8_t y_frac =
+            (*mapfrac.at(row, column) >> FRAC_BITS) & (FRAC_MAX - 1);
+        // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult)
+        const int16_t *coords = mapxy.at(row, column * 2);
+        ptrdiff_t x = coords[0], y = coords[1];
         for (size_t ch = 0; ch < src.channels(); ++ch) {
-          // Clang-tidy thinks mapfrac may contain garbage, but it is fully
-          // initialized at all code paths and the map size always equals dst
-          // map pixel size
-          // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult)
-          uint8_t x_frac = *mapfrac.at(row, column) & (FRAC_MAX - 1);
-          uint8_t y_frac =
-              (*mapfrac.at(row, column) >> FRAC_BITS) & (FRAC_MAX - 1);
-          // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult)
-          const int16_t *coords = mapxy.at(row, column * 2);
-          int16_t x = coords[0], y = coords[1];
           *expected.at(row, column * src.channels() + ch) =
               lerp_2d(x_frac, y_frac, get_src(x, y)[ch], get_src(x + 1, y)[ch],
                       get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]);
@@ -646,48 +678,69 @@ class RemapS16Point5 : public testing::Test {
 using RemapS16Point5ElementTypes = ::testing::Types<uint8_t, uint16_t>;
 TYPED_TEST_SUITE(RemapS16Point5, RemapS16Point5ElementTypes);
 
+template <typename T>
+size_t defaultWidth() {
+  return 3 * test::Options::vector_lanes<T>() - 1;
+}
+
+size_t defaultHeight() { return 4; }
+
 TYPED_TEST(RemapS16Point5, RandomNoPadding) {
-  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
-  size_t src_h = 4;
-  size_t dst_w = src_w;
-  size_t dst_h = src_h;
-  size_t channels = 1;
-  size_t padding = 0;
+  size_t w = defaultWidth<TypeParam>();
+  size_t h = defaultHeight();
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
-    TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, border_type,
-                             border_value, padding);
+    TestFixture::test_random(w, h, w, h, 1, border_type, border_value, 0);
   }
 }
 
+// TODO: Modify tests to also run constant border once implemented
+TYPED_TEST(RemapS16Point5, RandomNoPadding4chReplicate) {
+  size_t w = defaultWidth<TypeParam>();
+  size_t h = defaultHeight();
+  size_t channels = 4;
+  size_t padding = 0;
+  TestFixture::test_random(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE,
+                           nullptr, padding);
+}
+
 TYPED_TEST(RemapS16Point5, BlendPadding) {
-  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
-  size_t src_h = 4;
-  size_t dst_w = src_w;
-  size_t dst_h = src_h;
-  size_t channels = 1;
-  size_t padding = 13;
+  size_t w = defaultWidth<TypeParam>();
+  size_t h = defaultHeight();
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
-    TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type,
-                            border_value, padding);
+    TestFixture::test_blend(w, h, w, h, 1, border_type, border_value, 13);
   }
 }
 
+TYPED_TEST(RemapS16Point5, BlendPadding4chReplicate) {
+  size_t w = defaultWidth<TypeParam>();
+  size_t h = defaultHeight();
+  size_t channels = 4;
+  size_t padding = 7;
+  TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE,
+                          nullptr, padding);
+}
+
 TYPED_TEST(RemapS16Point5, OutsideRandomPadding) {
-  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
-  size_t src_h = 4;
-  size_t dst_w = src_w;
-  size_t dst_h = src_h;
-  size_t channels = 1;
-  size_t padding = 13;
+  size_t w = defaultWidth<TypeParam>();
+  size_t h = defaultHeight();
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
-    TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels,
-                                     border_type, border_value, padding);
+    TestFixture::test_outside_random(w, h, w, h, 1, border_type, border_value,
+                                     13);
   }
 }
 
+TYPED_TEST(RemapS16Point5, OutsideRandomPadding4chReplicate) {
+  size_t w = defaultWidth<TypeParam>();
+  size_t h = defaultHeight();
+  size_t channels = 4;
+  size_t padding = 11;
+  TestFixture::test_outside_random(
+      w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr, padding);
+}
+
 TYPED_TEST(RemapS16Point5, BlendBigStride) {
-  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
-  size_t src_h = 16;
+  size_t src_w = defaultWidth<TypeParam>();
+  size_t src_h = defaultHeight();
   size_t dst_w = src_w;
   size_t dst_h = src_h;
   size_t channels = 1;
@@ -698,6 +751,16 @@ TYPED_TEST(RemapS16Point5, BlendBigStride) {
   }
 }
 
+TYPED_TEST(RemapS16Point5, BlendBigStride4chReplicate) {
+  size_t w = defaultWidth<TypeParam>();
+  size_t h = defaultHeight();
+  size_t channels = 4;
+  size_t padding =
+      std::numeric_limits<uint16_t>::max() / channels - w * channels;
+  TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE,
+                          nullptr, padding);
+}
+
 TYPED_TEST(RemapS16Point5, CornerCases) {
   size_t src_w = std::numeric_limits<int16_t>::max() + 1;
   size_t src_h = std::numeric_limits<int16_t>::max() + 1;
@@ -711,6 +774,18 @@ TYPED_TEST(RemapS16Point5, CornerCases) {
   }
 }
 
+TYPED_TEST(RemapS16Point5, CornerCases4ch) {
+  size_t src_w = 100;
+  size_t src_h = 8;
+  size_t dst_w = defaultWidth<TypeParam>();
+  size_t dst_h = defaultHeight();
+  size_t channels = 4;
+  size_t padding = 17;
+  TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels,
+                                 KLEIDICV_BORDER_TYPE_REPLICATE, nullptr,
+                                 padding);
+}
+
 TYPED_TEST(RemapS16Point5, NullPointer) {
   const TypeParam src[4] = {};
   TypeParam dst[1];
@@ -819,17 +894,20 @@ TYPED_TEST(RemapS16Point5, UnsupportedBigStride) {
                 KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
 }
 
-TYPED_TEST(RemapS16Point5, UnsupportedTwoChannels) {
+TYPED_TEST(RemapS16Point5, UnsupportedChannels) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
   int16_t mapxy[16] = {};
   uint16_t mapfrac[8] = {};
 
-  EXPECT_EQ(
-      KLEIDICV_ERROR_NOT_IMPLEMENTED,
-      remap_s16point5<TypeParam>()(
-          src, 1 * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 2,
-          mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            remap_s16point5<TypeParam>()(
+                src, 1, 1, 1, dst, 8, 8, 1, 2, mapxy, 4, mapfrac, 2,
+                KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            remap_s16point5<TypeParam>()(
+                src, 1, 1, 1, dst, 8, 8, 1, 3, mapxy, 4, mapfrac, 2,
+                KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
 }
 
 TYPED_TEST(RemapS16Point5, UnsupportedBorderType) {
@@ -844,6 +922,33 @@ TYPED_TEST(RemapS16Point5, UnsupportedBorderType) {
                 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REFLECT, src));
 }
 
+TYPED_TEST(RemapS16Point5, UnsupportedConstantBorder4ch) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+  int16_t mapxy[16] = {};
+  uint16_t mapfrac[8] = {};
+
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            remap_s16point5<TypeParam>()(src, sizeof(TypeParam), 1, 1, dst, 8,
+                                         8, 1, 4, mapxy, 4, mapfrac, 2,
+                                         KLEIDICV_BORDER_TYPE_CONSTANT, src));
+}
+
+TYPED_TEST(RemapS16Point5, UnsupportedBigStride4ch) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+  int16_t mapxy[16] = {};
+  uint16_t mapfrac[8] = {};
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_NOT_IMPLEMENTED,
+      remap_s16point5<TypeParam>()(
+          src,
+          (std::numeric_limits<uint16_t>::max() / 4 + 1L) * sizeof(TypeParam),
+          1, 1, dst, 8, 8, 1, 4, mapxy, 4, mapfrac, 2,
+          KLEIDICV_BORDER_TYPE_CONSTANT, src));
+}
+
 TYPED_TEST(RemapS16Point5, UnsupportedTooSmallImage) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
index 9a2b9affa..c16f6356b 100644
--- a/test/api/test_thread.cpp
+++ b/test/api/test_thread.cpp
@@ -722,12 +722,24 @@ TEST_P(Thread, remap_s16point5_u8_border_replicate) {
                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
 }
 
+TEST_P(Thread, remap_s16point5_u8_border_replicate_4ch) {
+  check_remap_s16point5<uint8_t>(kleidicv_remap_s16point5_u8,
+                                 kleidicv_thread_remap_s16point5_u8, 4,
+                                 KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
+}
+
 TEST_P(Thread, remap_s16point5_u16_border_replicate) {
   check_remap_s16point5<uint16_t>(kleidicv_remap_s16point5_u16,
                                   kleidicv_thread_remap_s16point5_u16, 1,
                                   KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
 }
 
+TEST_P(Thread, remap_s16point5_u16_border_replicate_4ch) {
+  check_remap_s16point5<uint16_t>(kleidicv_remap_s16point5_u16,
+                                  kleidicv_thread_remap_s16point5_u16, 4,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
+}
+
 TEST_P(Thread, remap_s16point5_u8_border_constant) {
   const uint8_t border_value = 0;
   check_remap_s16point5<uint8_t>(kleidicv_remap_s16point5_u8,
-- 
GitLab