diff --git a/CHANGELOG.md b/CHANGELOG.md
index 87be149c01a407430642063093cd953eb5186dda..4d3213a9bcf96834767c50593e15681d6b472457 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,7 +20,7 @@ This changelog aims to follow the guiding principles of
 - Remap implementations with
   - Integer coordinates with nearest neighbour method
   - Fixed-point coordinates with linear interpolation
-  - Floating-point coordinates with linear interpolation
+  - Floating-point coordinates with nearest neighbour and linear interpolation
   - Replicated and constant borders
   - 1-channel only
   - u8 and u16 images
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index c61ef14a4a2b4e59c7c1bfaa4c1ed415600deb12..94f8dd46b5fcdbd7143b347e3ff28e573e8602e1 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -857,30 +857,60 @@ static void remap_f32(Function f, MapFuncX mfx, MapFuncY mfy, size_t channels,
   }                                                                    \
   BENCHMARK(benchname)
 
-BENCH_REMAP_F32(remap_f32_u8_random, remap_f32_u8, get_random_mapx<float>,
-                get_random_mapy<float>, 1, KLEIDICV_INTERPOLATION_LINEAR,
-                KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+BENCH_REMAP_F32(remap_f32_u8_linear_random, remap_f32_u8,
+                get_random_mapx<float>, get_random_mapy<float>, 1,
+                KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE,
+                uint8_t);
 
-BENCH_REMAP_F32(remap_f32_u8_blend, remap_f32_u8, get_blend_mapx<float>,
+BENCH_REMAP_F32(remap_f32_u8_linear_blend, remap_f32_u8, get_blend_mapx<float>,
                 get_blend_mapy<float>, 1, KLEIDICV_INTERPOLATION_LINEAR,
                 KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
 
-BENCH_REMAP_F32(remap_f32_u8_flip, remap_f32_u8, get_flip_mapx<float>,
+BENCH_REMAP_F32(remap_f32_u8_linear_flip, remap_f32_u8, get_flip_mapx<float>,
                 get_flip_mapy<float>, 1, KLEIDICV_INTERPOLATION_LINEAR,
                 KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
 
-BENCH_REMAP_F32(remap_f32_u16_random, remap_f32_u16, get_random_mapx<float>,
-                get_random_mapy<float>, 1, KLEIDICV_INTERPOLATION_LINEAR,
-                KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t);
+BENCH_REMAP_F32(remap_f32_u16_linear_random, remap_f32_u16,
+                get_random_mapx<float>, get_random_mapy<float>, 1,
+                KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE,
+                uint16_t);
 
-BENCH_REMAP_F32(remap_f32_u16_blend, remap_f32_u16, get_blend_mapx<float>,
-                get_blend_mapy<float>, 1, KLEIDICV_INTERPOLATION_LINEAR,
-                KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t);
+BENCH_REMAP_F32(remap_f32_u16_linear_blend, remap_f32_u16,
+                get_blend_mapx<float>, get_blend_mapy<float>, 1,
+                KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE,
+                uint16_t);
 
-BENCH_REMAP_F32(remap_f32_u16_flip, remap_f32_u16, get_flip_mapx<float>,
+BENCH_REMAP_F32(remap_f32_u16_linear_flip, remap_f32_u16, get_flip_mapx<float>,
                 get_flip_mapy<float>, 1, KLEIDICV_INTERPOLATION_LINEAR,
                 KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t);
 
+BENCH_REMAP_F32(remap_f32_u8_nearest_random, remap_f32_u8,
+                get_random_mapx<float>, get_random_mapy<float>, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                uint8_t);
+
+BENCH_REMAP_F32(remap_f32_u8_nearest_blend, remap_f32_u8, get_blend_mapx<float>,
+                get_blend_mapy<float>, 1, KLEIDICV_INTERPOLATION_NEAREST,
+                KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+
+BENCH_REMAP_F32(remap_f32_u8_nearest_flip, remap_f32_u8, get_flip_mapx<float>,
+                get_flip_mapy<float>, 1, KLEIDICV_INTERPOLATION_NEAREST,
+                KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+
+BENCH_REMAP_F32(remap_f32_u16_nearest_random, remap_f32_u16,
+                get_random_mapx<float>, get_random_mapy<float>, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                uint16_t);
+
+BENCH_REMAP_F32(remap_f32_u16_nearest_blend, remap_f32_u16,
+                get_blend_mapx<float>, get_blend_mapy<float>, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                uint16_t);
+
+BENCH_REMAP_F32(remap_f32_u16_nearest_flip, remap_f32_u16, get_flip_mapx<float>,
+                get_flip_mapy<float>, 1, KLEIDICV_INTERPOLATION_NEAREST,
+                KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t);
+
 // clang-format off
 static const float transform_identity[] = {
   1.0, 0, 0,
diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp
index 316ebfdfe2077dc916357be11de6b1fc04b2e294..face699ec1f7f0080db7d6bda130a782bbad2808 100644
--- a/conformity/opencv/test_remap.cpp
+++ b/conformity/opencv/test_remap.cpp
@@ -161,10 +161,29 @@ bool test_remap_f32(int index, RecreatedMessageQueue& request_queue,
     for (size_t h = 5; h <= kMaxHeight * 2; h += 2) {
       cv::Mat map_mat(h * 2, w, CV_32FC1);
       cv::Mat mapx_mat = map_mat.rowRange(0, h);
-      rng.fill(mapx_mat, cv::RNG::UNIFORM, -3, kMaxWidth + 3);
-
       cv::Mat mapy_mat = map_mat.rowRange(h, map_mat.rows);
-      rng.fill(mapy_mat, cv::RNG::UNIFORM, -3, kMaxHeight + 3);
+      for (size_t y = 0; y < h; ++y) {
+        for (size_t x = 0; x < w; ++x) {
+          // Values from -0.49 to 0.49, so exactly 0.5 is excluded
+
+          // Reason: When rounding floating point values to integer, OpenCV does
+          // scalar rounding that works differently based on the rounding
+          // environment. E.g. it can use "Rounding to nearest, ties to even",
+          // while KleidiCV always uses "Rounding to nearest, ties towards
+          // plus infinity". To prevent these differences, values with exactly
+          // 0.5 fractional part are excluded.
+          float divisor = (1.01 * 0x1p32);
+          float epsilon = 0x1p-16;
+          float fractionX = rng.next() / divisor - 0.5F + epsilon;
+          float fractionY = rng.next() / divisor - 0.5F + epsilon;
+          mapx_mat.at<float>(y, x) =
+              (static_cast<int32_t>(rng.next() % (kMaxWidth + 6)) - 3) +
+              fractionX;
+          mapy_mat.at<float>(y, x) =
+              (static_cast<int32_t>(rng.next() % (kMaxHeight + 6)) - 3) +
+              fractionY;
+        }
+      }
 
       cv::Mat actual_mat = exec_remap_f32<ScalarType, Format, Interpolation,
                                           BorderMode, BorderValue>(map_mat);
@@ -235,10 +254,14 @@ std::vector<test>& remap_tests_get() {
     TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>), (exec_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>)),
     TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>), (exec_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>)),
 
-    TEST("RemapF32 uint8 Replicate", (test_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
-    TEST("RemapF32 uint16 Replicate", (test_remap_f32<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_f32<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
-    TEST("RemapF32 uint8 Constant", (test_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>), (exec_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>)),
-    TEST("RemapF32 uint16 Constant", (test_remap_f32<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 123210>), (exec_remap_f32<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 123210>)),
+    TEST("RemapF32 uint8 Replicate Linear", (test_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
+    TEST("RemapF32 uint16 Replicate Linear", (test_remap_f32<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_f32<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
+    TEST("RemapF32 uint8 Constant Linear", (test_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>), (exec_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>)),
+    TEST("RemapF32 uint16 Constant Linear", (test_remap_f32<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 123210>), (exec_remap_f32<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 123210>)),
+    TEST("RemapF32 uint8 Replicate Nearest", (test_remap_f32<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>), (exec_remap_f32<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>)),
+    TEST("RemapF32 uint16 Replicate Nearest", (test_remap_f32<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>), (exec_remap_f32<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>)),
+    TEST("RemapF32 uint8 Constant Nearest", (test_remap_f32<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 12321>), (exec_remap_f32<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 12321>)),
+    TEST("RemapF32 uint16 Constant Nearest", (test_remap_f32<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 123210>), (exec_remap_f32<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 123210>)),
   };
   // clang-format on
   return tests;
diff --git a/doc/opencv.md b/doc/opencv.md
index 2ddae12c6a463fd4781273cd20641abf06ae6fdb..a972dbdacd953e1beca50d51e906df08ac914b16 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -223,7 +223,7 @@ Supported map configurations:
 * `map1` is 32FC1 and `map2` is 32FC1:
   * `map1` is x coordinates (column)
   * `map2` is y coordinates (row)
-  * supported `interpolation`: `INTER_LINEAR` only
+  * supported `interpolation`: `INTER_NEAREST` and `INTER_LINEAR`
 
 ### [`cv::warpPerspective()`](https://docs.opencv.org/4.10.0/da/d54/group__imgproc__transform.html#gaf73673a7e8e18ec6963e3774e6a94b87)
 Performs a perspective transformation on an image.
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index 094cc8c41b33b703bb908df280fc528fbbdcdf28..294414163f20314982b5b31d4fdd90bb1e2f9e98 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1873,6 +1873,7 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src,
 ///                       sizeof(float)`, except for single-row images.
 /// @param interpolation  Interpolation algorithm. Supported types: \n
 ///                         - @ref KLEIDICV_INTERPOLATION_LINEAR
+///                         - @ref KLEIDICV_INTERPOLATION_NEAREST
 /// @param border_type    Way of handling the border. The supported border types
 ///                       are: \n
 ///                         - @ref KLEIDICV_BORDER_TYPE_REPLICATE
diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h
index 6ef3447152dae74e16403d7808cbc9d99dc6e84d..dcf43bd6401eb902417ae003810a5aefbc19e506 100644
--- a/kleidicv/include/kleidicv/transform/remap.h
+++ b/kleidicv/include/kleidicv/transform/remap.h
@@ -65,7 +65,9 @@ inline bool remap_f32_is_implemented(
             static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) + 1 &&
         (border_type == KLEIDICV_BORDER_TYPE_REPLICATE ||
          border_type == KLEIDICV_BORDER_TYPE_CONSTANT) &&
-        channels == 1 && interpolation == KLEIDICV_INTERPOLATION_LINEAR);
+        channels == 1 &&
+        (interpolation == KLEIDICV_INTERPOLATION_LINEAR ||
+         interpolation == KLEIDICV_INTERPOLATION_NEAREST));
   } else {
     return false;
   }
diff --git a/kleidicv/src/transform/remap_neon.cpp b/kleidicv/src/transform/remap_neon.cpp
index d61561777d79a75d5966324fa814b432855188f9..936d42139b8768a48823fa5c83f6f7842aee579f 100644
--- a/kleidicv/src/transform/remap_neon.cpp
+++ b/kleidicv/src/transform/remap_neon.cpp
@@ -1510,6 +1510,465 @@ class RemapF32ConstantBorder<uint16_t, IsLarge> {
 };  // end of class RemapF32ConstantBorder<uint16_t>
 // NOLINTEND(readability-function-cognitive-complexity)
 
+template <typename ScalarType, bool IsLarge>
+class RemapF32NearestReplicate;
+
+template <bool IsLarge>
+class RemapF32NearestReplicate<uint8_t, IsLarge> {
+ public:
+  using ScalarType = uint8_t;
+  using MapVecTraits = neon::VecTraits<float>;
+  using MapVectorType = typename MapVecTraits::VectorType;  // float32x4_t
+
+  RemapF32NearestReplicate(Rows<const ScalarType> src_rows, size_t src_width,
+                           size_t src_height)
+      : src_rows_{src_rows},
+        v_src_stride_{vdupq_n_u32(static_cast<uint32_t>(src_rows_.stride()))},
+        v_xmax_{vdupq_n_u32(static_cast<uint32_t>(src_width - 1))},
+        v_ymax_{vdupq_n_u32(static_cast<uint32_t>(src_height - 1))} {}
+
+  void get_map_coordinates(Columns<const float> mapx, Columns<const float> mapy,
+                           uint32x4_t &x, uint32x4_t &y) {
+    MapVectorType x_raw = vld1q_f32(&mapx[0]);
+    MapVectorType y_raw = vld1q_f32(&mapy[0]);
+
+    MapVectorType bias = vdupq_n_f32(0.5F);
+    // Round half up: floor(v + 0.5); negative inputs saturate to 0
+    uint32x4_t x_nearest = vcvtmq_u32_f32(vaddq_f32(x_raw, bias));
+    uint32x4_t y_nearest = vcvtmq_u32_f32(vaddq_f32(y_raw, bias));
+
+    // Clamp coordinates to within the dimensions of the source image
+    x = vmaxq_u32(vdupq_n_u32(0), vminq_u32(x_nearest, v_xmax_));
+    y = vmaxq_u32(vdupq_n_u32(0), vminq_u32(y_nearest, v_ymax_));
+  }
+
+  uint8x8_t load_pixels_large(uint32x4_t x, uint32x4_t y) {
+    // Calculate offsets from coordinates (y * stride + x)
+    uint64x2_t indices_low =
+        vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
+                  vget_low_u32(v_src_stride_));
+    uint64x2_t indices_high =
+        vmlal_high_u32(vmovl_high_u32(x), y, v_src_stride_);
+
+    uint8x8_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)],
+                        src_rows_[vgetq_lane_u64(indices_low, 1)],
+                        src_rows_[vgetq_lane_u64(indices_high, 0)],
+                        src_rows_[vgetq_lane_u64(indices_high, 1)],
+                        0,
+                        0,
+                        0,
+                        0};
+    return pixels;
+  }
+
+  uint8x8_t load_pixels_small(uint32x4_t x, uint32x4_t y) {
+    // Calculate offsets from coordinates (y * stride + x)
+    uint32x4_t indices = vmlaq_u32(x, y, v_src_stride_);
+
+    uint8x8_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)],
+                        src_rows_[vgetq_lane_u32(indices, 1)],
+                        src_rows_[vgetq_lane_u32(indices, 2)],
+                        src_rows_[vgetq_lane_u32(indices, 3)],
+                        0,
+                        0,
+                        0,
+                        0};
+    return pixels;
+  }
+
+  void store_pixels(uint8x8_t pixels, Columns<ScalarType> dst) {
+    dst[0] = vget_lane_u8(pixels, 0);
+    dst[1] = vget_lane_u8(pixels, 1);
+    dst[2] = vget_lane_u8(pixels, 2);
+    dst[3] = vget_lane_u8(pixels, 3);
+  }
+
+  void process_row(size_t width, Columns<const float> mapx,
+                   Columns<const float> mapy, Columns<ScalarType> dst) {
+    const size_t kStep = VecTraits<float>::num_lanes();
+
+    auto vector_path = [&](size_t step) {
+      uint32x4_t x, y;
+      get_map_coordinates(mapx, mapy, x, y);
+
+      uint8x8_t pixels;
+      if constexpr (IsLarge) {
+        pixels = load_pixels_large(x, y);
+      } else {
+        pixels = load_pixels_small(x, y);
+      }
+
+      store_pixels(pixels, dst);
+
+      mapx += ptrdiff_t(step);
+      mapy += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    };
+
+    LoopUnroll loop{width, kStep};
+    loop.unroll_once(vector_path);
+    ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
+                          static_cast<ptrdiff_t>(loop.remaining_length());
+    mapx -= back_step;
+    mapy -= back_step;
+    dst -= back_step;
+    loop.remaining([&](size_t, size_t step) { vector_path(step); });
+  }
+
+ private:
+  Rows<const ScalarType> src_rows_;
+  uint32x4_t v_src_stride_;
+  uint32x4_t v_xmax_;
+  uint32x4_t v_ymax_;
+};  // end of class RemapF32NearestReplicate<uint8_t>
+
+template <bool IsLarge>
+class RemapF32NearestReplicate<uint16_t, IsLarge> {
+ public:
+  using ScalarType = uint16_t;
+  using MapVecTraits = neon::VecTraits<float>;
+  using MapVectorType = typename MapVecTraits::VectorType;  // float32x4_t
+
+  RemapF32NearestReplicate(Rows<const ScalarType> src_rows, size_t src_width,
+                           size_t src_height)
+      : src_rows_{src_rows},
+        v_src_element_stride_{vdupq_n_u32(
+            static_cast<uint32_t>(src_rows_.stride() / sizeof(ScalarType)))},
+        v_xmax_{vdupq_n_u32(static_cast<uint32_t>(src_width - 1))},
+        v_ymax_{vdupq_n_u32(static_cast<uint32_t>(src_height - 1))} {}
+
+  void get_map_coordinates(Columns<const float> mapx, Columns<const float> mapy,
+                           uint32x4_t &x, uint32x4_t &y) {
+    MapVectorType x_raw = vld1q_f32(&mapx[0]);
+    MapVectorType y_raw = vld1q_f32(&mapy[0]);
+
+    MapVectorType bias = vdupq_n_f32(0.5F);
+    // Round half up: floor(v + 0.5); negative inputs saturate to 0
+    uint32x4_t x_nearest = vcvtmq_u32_f32(vaddq_f32(x_raw, bias));
+    uint32x4_t y_nearest = vcvtmq_u32_f32(vaddq_f32(y_raw, bias));
+
+    // Clamp coordinates to within the dimensions of the source image
+    x = vmaxq_u32(vdupq_n_u32(0), vminq_u32(x_nearest, v_xmax_));
+    y = vmaxq_u32(vdupq_n_u32(0), vminq_u32(y_nearest, v_ymax_));
+  }
+
+  uint16x4_t load_pixels_large(uint32x4_t x, uint32x4_t y) {
+    // Calculate offsets from coordinates (y * element_stride + x)
+    uint64x2_t indices_low =
+        vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
+                  vget_low_u32(v_src_element_stride_));
+    uint64x2_t indices_high =
+        vmlal_high_u32(vmovl_high_u32(x), y, v_src_element_stride_);
+
+    uint16x4_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)],
+                         src_rows_[vgetq_lane_u64(indices_low, 1)],
+                         src_rows_[vgetq_lane_u64(indices_high, 0)],
+                         src_rows_[vgetq_lane_u64(indices_high, 1)]};
+    return pixels;
+  }
+
+  uint16x4_t load_pixels_small(uint32x4_t x, uint32x4_t y) {
+    // Calculate offsets from coordinates (y * element_stride + x)
+    uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride_);
+
+    uint16x4_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)],
+                         src_rows_[vgetq_lane_u32(indices, 1)],
+                         src_rows_[vgetq_lane_u32(indices, 2)],
+                         src_rows_[vgetq_lane_u32(indices, 3)]};
+    return pixels;
+  }
+
+  void store_pixels(uint16x4_t pixels, Columns<ScalarType> dst) {
+    vst1_u16(&dst[0], pixels);
+  }
+
+  void process_row(size_t width, Columns<const float> mapx,
+                   Columns<const float> mapy, Columns<ScalarType> dst) {
+    const size_t kStep = VecTraits<float>::num_lanes();
+
+    auto vector_path = [&](size_t step) {
+      uint32x4_t x, y;
+      get_map_coordinates(mapx, mapy, x, y);
+
+      uint16x4_t pixels;
+      if constexpr (IsLarge) {
+        pixels = load_pixels_large(x, y);
+      } else {
+        pixels = load_pixels_small(x, y);
+      }
+
+      store_pixels(pixels, dst);
+
+      mapx += ptrdiff_t(step);
+      mapy += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    };
+
+    LoopUnroll loop{width, kStep};
+    loop.unroll_once(vector_path);
+    ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
+                          static_cast<ptrdiff_t>(loop.remaining_length());
+    mapx -= back_step;
+    mapy -= back_step;
+    dst -= back_step;
+    loop.remaining([&](size_t, size_t step) { vector_path(step); });
+  }
+
+ private:
+  Rows<const ScalarType> src_rows_;
+  uint32x4_t v_src_element_stride_;
+  uint32x4_t v_xmax_;
+  uint32x4_t v_ymax_;
+};  // end of class RemapF32NearestReplicate<uint16_t>
+
+template <typename ScalarType, bool IsLarge>
+class RemapF32NearestConstant;
+
+template <bool IsLarge>
+class RemapF32NearestConstant<uint8_t, IsLarge> {
+ public:
+  using ScalarType = uint8_t;
+  using MapVecTraits = neon::VecTraits<float>;
+  using MapVectorType = typename MapVecTraits::VectorType;  // float32x4_t
+
+  RemapF32NearestConstant(Rows<const ScalarType> src_rows, size_t src_width,
+                          size_t src_height, const ScalarType *border_value)
+      : src_rows_{src_rows},
+        v_src_stride_{vdupq_n_u32(static_cast<uint32_t>(src_rows_.stride()))},
+        v_width_{vdupq_n_u32(static_cast<uint32_t>(src_width))},
+        v_height_{vdupq_n_u32(static_cast<uint32_t>(src_height))},
+        v_border_{vdup_n_u8(*border_value)} {}
+
+  void get_map_coordinates(Columns<const float> mapx, Columns<const float> mapy,
+                           uint32x4_t &x, uint32x4_t &y, uint32x4_t &in_range) {
+    MapVectorType x_raw = vld1q_f32(&mapx[0]);
+    MapVectorType y_raw = vld1q_f32(&mapy[0]);
+
+    MapVectorType bias = vdupq_n_f32(0.5F);
+    float32x4_t x_biased = vaddq_f32(x_raw, bias);
+    float32x4_t y_biased = vaddq_f32(y_raw, bias);
+
+    // Round half up: floor of biased value; negatives saturate to 0
+    uint32x4_t x_nearest = vcvtmq_u32_f32(x_biased);
+    uint32x4_t y_nearest = vcvtmq_u32_f32(y_biased);
+
+    // Find whether coordinates are within the image dimensions.
+    uint32x4_t above_zero =
+        vandq_u32(vcgezq_f32(x_biased), vcgezq_f32(y_biased));
+    uint32x4_t below_limits = vandq_u32(vcltq_u32(x_nearest, v_width_),
+                                        vcltq_u32(y_nearest, v_height_));
+    in_range = vandq_u32(above_zero, below_limits);
+
+    // Zero out-of-range coordinates.
+    x = vandq_u32(in_range, x_nearest);
+    y = vandq_u32(in_range, y_nearest);
+  }
+
+  uint8x8_t load_pixels_large(uint32x4_t x, uint32x4_t y) {
+    // Calculate offsets from coordinates (y * stride + x)
+    uint64x2_t indices_low =
+        vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
+                  vget_low_u32(v_src_stride_));
+    uint64x2_t indices_high =
+        vmlal_high_u32(vmovl_high_u32(x), y, v_src_stride_);
+
+    uint8x8_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)],
+                        src_rows_[vgetq_lane_u64(indices_low, 1)],
+                        src_rows_[vgetq_lane_u64(indices_high, 0)],
+                        src_rows_[vgetq_lane_u64(indices_high, 1)],
+                        0,
+                        0,
+                        0,
+                        0};
+    return pixels;
+  }
+
+  uint8x8_t load_pixels_small(uint32x4_t x, uint32x4_t y) {
+    // Calculate offsets from coordinates (y * stride + x)
+    uint32x4_t indices = vmlaq_u32(x, y, v_src_stride_);
+
+    uint8x8_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)],
+                        src_rows_[vgetq_lane_u32(indices, 1)],
+                        src_rows_[vgetq_lane_u32(indices, 2)],
+                        src_rows_[vgetq_lane_u32(indices, 3)],
+                        0,
+                        0,
+                        0,
+                        0};
+    return pixels;
+  }
+
+  void store_pixels(uint8x8_t pixels, Columns<ScalarType> dst) {
+    dst[0] = vget_lane_u8(pixels, 0);
+    dst[1] = vget_lane_u8(pixels, 1);
+    dst[2] = vget_lane_u8(pixels, 2);
+    dst[3] = vget_lane_u8(pixels, 3);
+  }
+
+  void process_row(size_t width, Columns<const float> mapx,
+                   Columns<const float> mapy, Columns<ScalarType> dst) {
+    const size_t kStep = VecTraits<float>::num_lanes();
+
+    auto vector_path = [&](size_t step) {
+      uint32x4_t x, y;
+      uint32x4_t in_range;
+      get_map_coordinates(mapx, mapy, x, y, in_range);
+
+      uint8x8_t pixels;
+      if constexpr (IsLarge) {
+        pixels = load_pixels_large(x, y);
+      } else {
+        pixels = load_pixels_small(x, y);
+      }
+
+      // Select between source pixels and border colour
+      uint8x8_t in_range_narrowed =
+          vmovn_u16(vcombine_u16(vmovn_u32(in_range), vdup_n_u16(0)));
+      uint8x8_t pixels_or_border =
+          vbsl_u8(in_range_narrowed, pixels, v_border_);
+
+      store_pixels(pixels_or_border, dst);
+
+      mapx += ptrdiff_t(step);
+      mapy += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    };
+
+    LoopUnroll loop{width, kStep};
+    loop.unroll_once(vector_path);
+    ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
+                          static_cast<ptrdiff_t>(loop.remaining_length());
+    mapx -= back_step;
+    mapy -= back_step;
+    dst -= back_step;
+    loop.remaining([&](size_t, size_t step) { vector_path(step); });
+  }
+
+ private:
+  Rows<const ScalarType> src_rows_;
+  uint32x4_t v_src_stride_;
+  uint32x4_t v_width_;
+  uint32x4_t v_height_;
+  uint8x8_t v_border_;
+};  // end of class RemapF32NearestConstant<uint8_t>
+
+template <bool IsLarge>
+class RemapF32NearestConstant<uint16_t, IsLarge> {
+ public:
+  using ScalarType = uint16_t;
+  using MapVecTraits = neon::VecTraits<float>;
+  using MapVectorType = typename MapVecTraits::VectorType;  // float32x4_t
+
+  RemapF32NearestConstant(Rows<const ScalarType> src_rows, size_t src_width,
+                          size_t src_height, const ScalarType *border_value)
+      : src_rows_{src_rows},
+        v_src_element_stride_{vdupq_n_u32(
+            static_cast<uint32_t>(src_rows_.stride() / sizeof(ScalarType)))},
+        v_width_{vdupq_n_u32(static_cast<uint32_t>(src_width))},
+        v_height_{vdupq_n_u32(static_cast<uint32_t>(src_height))},
+        v_border_{vdup_n_u16(*border_value)} {}
+
+  void get_map_coordinates(Columns<const float> mapx, Columns<const float> mapy,
+                           uint32x4_t &x, uint32x4_t &y, uint32x4_t &in_range) {
+    MapVectorType x_raw = vld1q_f32(&mapx[0]);
+    MapVectorType y_raw = vld1q_f32(&mapy[0]);
+
+    MapVectorType bias = vdupq_n_f32(0.5F);
+    float32x4_t x_biased = vaddq_f32(x_raw, bias);
+    float32x4_t y_biased = vaddq_f32(y_raw, bias);
+
+    // Round half up: floor of biased value; negatives saturate to 0
+    uint32x4_t x_nearest = vcvtmq_u32_f32(x_biased);
+    uint32x4_t y_nearest = vcvtmq_u32_f32(y_biased);
+
+    // Find whether coordinates are within the image dimensions.
+    uint32x4_t above_zero =
+        vandq_u32(vcgezq_f32(x_biased), vcgezq_f32(y_biased));
+    uint32x4_t below_limits = vandq_u32(vcltq_u32(x_nearest, v_width_),
+                                        vcltq_u32(y_nearest, v_height_));
+    in_range = vandq_u32(above_zero, below_limits);
+
+    // Zero out-of-range coordinates.
+    x = vandq_u32(in_range, x_nearest);
+    y = vandq_u32(in_range, y_nearest);
+  }
+
+  uint16x4_t load_pixels_large(uint32x4_t x, uint32x4_t y) {
+    // Calculate offsets from coordinates (y * element_stride + x)
+    uint64x2_t indices_low =
+        vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
+                  vget_low_u32(v_src_element_stride_));
+    uint64x2_t indices_high =
+        vmlal_high_u32(vmovl_high_u32(x), y, v_src_element_stride_);
+
+    uint16x4_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)],
+                         src_rows_[vgetq_lane_u64(indices_low, 1)],
+                         src_rows_[vgetq_lane_u64(indices_high, 0)],
+                         src_rows_[vgetq_lane_u64(indices_high, 1)]};
+    return pixels;
+  }
+
+  uint16x4_t load_pixels_small(uint32x4_t x, uint32x4_t y) {
+    // Calculate offsets from coordinates (y * element_stride + x)
+    uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride_);
+
+    uint16x4_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)],
+                         src_rows_[vgetq_lane_u32(indices, 1)],
+                         src_rows_[vgetq_lane_u32(indices, 2)],
+                         src_rows_[vgetq_lane_u32(indices, 3)]};
+    return pixels;
+  }
+
+  void store_pixels(uint16x4_t pixels, Columns<ScalarType> dst) {
+    vst1_u16(&dst[0], pixels);
+  }
+
+  void process_row(size_t width, Columns<const float> mapx,
+                   Columns<const float> mapy, Columns<ScalarType> dst) {
+    const size_t kStep = VecTraits<float>::num_lanes();
+
+    auto vector_path = [&](size_t step) {
+      uint32x4_t x, y;
+      uint32x4_t in_range;
+      get_map_coordinates(mapx, mapy, x, y, in_range);
+
+      uint16x4_t pixels;
+      if constexpr (IsLarge) {
+        pixels = load_pixels_large(x, y);
+      } else {
+        pixels = load_pixels_small(x, y);
+      }
+
+      // Select between source pixels and border colour
+      uint16x4_t in_range_narrowed = vmovn_u32(in_range);
+      uint16x4_t pixels_or_border =
+          vbsl_u16(in_range_narrowed, pixels, v_border_);
+
+      store_pixels(pixels_or_border, dst);
+
+      mapx += ptrdiff_t(step);
+      mapy += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    };
+
+    LoopUnroll loop{width, kStep};
+    loop.unroll_once(vector_path);
+    ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
+                          static_cast<ptrdiff_t>(loop.remaining_length());
+    mapx -= back_step;
+    mapy -= back_step;
+    dst -= back_step;
+    loop.remaining([&](size_t, size_t step) { vector_path(step); });
+  }
+
+ private:
+  Rows<const ScalarType> src_rows_;
+  uint32x4_t v_src_element_stride_;
+  uint32x4_t v_width_;
+  uint32x4_t v_height_;
+  uint16x4_t v_border_;
+};  // end of class RemapF32NearestConstant<uint16_t>
+
 // Most of the complexity comes from parameter checking.
 // NOLINTBEGIN(readability-function-cognitive-complexity)
 template <typename T>
@@ -1549,24 +2008,50 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width,
   Rows<T> dst_rows{dst, dst_stride, channels};
   Rectangle rect{dst_width, dst_height};
 
-  if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
-    if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
-      RemapF32ConstantBorder<T, true> operation{src_rows, src_width, src_height,
-                                                border_value};
-      zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+  if (interpolation == KLEIDICV_INTERPOLATION_LINEAR) {
+    if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
+      if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
+        RemapF32ConstantBorder<T, true> operation{src_rows, src_width,
+                                                  src_height, border_value};
+        zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      } else {
+        RemapF32ConstantBorder<T, false> operation{src_rows, src_width,
+                                                   src_height, border_value};
+        zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      }
     } else {
-      RemapF32ConstantBorder<T, false> operation{src_rows, src_width,
-                                                 src_height, border_value};
-      zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
+      if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
+        RemapF32Replicate<T, true> operation{src_rows, src_width, src_height};
+        zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      } else {
+        RemapF32Replicate<T, false> operation{src_rows, src_width, src_height};
+        zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      }
     }
   } else {
-    assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
-    if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
-      RemapF32Replicate<T, true> operation{src_rows, src_width, src_height};
-      zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+    assert(interpolation == KLEIDICV_INTERPOLATION_NEAREST);
+    if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
+      if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
+        RemapF32NearestConstant<T, true> operation{src_rows, src_width,
+                                                   src_height, border_value};
+        zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      } else {
+        RemapF32NearestConstant<T, false> operation{src_rows, src_width,
+                                                    src_height, border_value};
+        zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      }
     } else {
-      RemapF32Replicate<T, false> operation{src_rows, src_width, src_height};
-      zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
+      if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
+        RemapF32NearestReplicate<T, true> operation{src_rows, src_width,
+                                                    src_height};
+        zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      } else {
+        RemapF32NearestReplicate<T, false> operation{src_rows, src_width,
+                                                     src_height};
+        zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+      }
     }
   }
 
diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h
index dc52a2931fac060de7e058df5991108366080cea..a0e27f1696bb2b418d7def0c95c478ada2664520 100644
--- a/kleidicv/src/transform/remap_sc.h
+++ b/kleidicv/src/transform/remap_sc.h
@@ -822,6 +822,127 @@ kleidicv_error_t remap_s16point5_sc(
   return KLEIDICV_OK;
 }
 
+template <typename ScalarType, bool IsLarge, kleidicv_border_type_t Border>  // Nearest-neighbour F32 remap over dst_width destination columns.
+void remap32f_nearest(svuint32_t sv_xmax, svuint32_t sv_ymax,  // sv_xmax/sv_ymax: upper bounds used to clamp or range-check x/y
+                      svuint32_t sv_src_stride, Rows<const ScalarType> src_rows,  // both forwarded to load_common
+                      svuint32_t sv_border, Columns<ScalarType> dst,  // sv_border: only selected when Border is CONSTANT
+                      size_t kStep, size_t dst_width,  // kStep: column advance per vector; dst_width: columns to produce
+                      Rows<const float> mapx_rows,  // float x coordinates, indexed by destination column
+                      Rows<const float> mapy_rows) {  // float y coordinates, indexed by destination column
+  svbool_t pg_all32 = svptrue_b32();  // predicate with every 32-bit lane active
+  auto load_coords = [&](svbool_t pg, size_t xs) {  // loads the x/y map values starting at column xs
+    auto x = static_cast<ptrdiff_t>(xs);
+    return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]),
+                     svld1_f32(pg, &mapy_rows.as_columns()[x]));
+  };
+
+  auto get_pixels = [&](svbool_t pg, svuint32x2_t coords) {  // fetches source pixels via load_common
+    svuint32_t x = svget2(coords, 0);
+    svuint32_t y = svget2(coords, 1);
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+      svbool_t in_range = svand_b_z(pg, svcmple_u32(pg, x, sv_xmax),  // lanes with x and y both within bounds
+                                    svcmple_u32(pg, y, sv_ymax));
+      svuint32_t result = load_common<ScalarType, IsLarge>(
+          in_range, x, y, sv_src_stride, src_rows);  // load is predicated on in_range, not pg
+      // Keep the loaded pixel for in-range lanes, the border value elsewhere
+      return svsel_u32(in_range, result, sv_border);
+    } else {
+      static_assert(Border == KLEIDICV_BORDER_TYPE_REPLICATE);
+      return load_common<ScalarType, IsLarge>(pg, x, y, sv_src_stride,
+                                              src_rows);  // coords were clamped in calculate_nearest_coordinates
+    }
+  };
+
+  auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) {  // float map coords -> integer source coords
+    svfloat32x2_t coords = load_coords(pg32, x);
+    svfloat32_t xf = svget2(coords, 0);
+    svfloat32_t yf = svget2(coords, 1);
+
+    svuint32_t xi, yi;
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+      // Round to the nearest integer; a negative result reinterprets as a large unsigned value (expected to fail the range check in get_pixels)
+      xi = svreinterpret_u32_s32(
+          svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf)));
+      yi = svreinterpret_u32_s32(
+          svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf)));
+    } else {
+      // Add 0.5 and convert to unsigned to round to the nearest integer, then clamp
+      // to sv_xmax / sv_ymax (negative values are already saturated to 0)
+      xi = svmin_x(pg_all32,
+                   svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)),
+                   sv_xmax);
+      yi = svmin_x(pg_all32,
+                   svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)),
+                   sv_ymax);
+    }
+    return svcreate2(xi, yi);
+  };
+
+  LoopUnroll2 loop{dst_width, kStep};  // drives the unrolled, single-vector and tail callbacks below
+
+  if constexpr (std::is_same<ScalarType, uint8_t>::value) {  // 8-bit pixels
+    auto vector_path_generic = [&](size_t x, size_t x_max,
+                                   Columns<ScalarType> dst) {
+      size_t length = x_max - x;
+      svbool_t pg32 = svwhilelt_b32(0ULL, length);  // enable only the first `length` lanes
+      svuint32_t result =
+          get_pixels(pg32, calculate_nearest_coordinates(pg32, x));
+      svst1b_u32(pg32, &dst[static_cast<ptrdiff_t>(x)], result);  // truncating store: low byte of each active lane
+    };
+
+    loop.unroll_four_times([&](size_t x) {  // four 32-bit result vectors are packed into one byte vector
+      ScalarType* p_dst = &dst[static_cast<ptrdiff_t>(x)];
+      svuint32_t res32_0 =
+          get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x));
+      x += kStep;
+      svuint32_t res32_1 =
+          get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x));
+      svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),  // even 16-bit elements: low half of each 32-bit lane (little-endian layout assumed)
+                                      svreinterpret_u16_u32(res32_1));
+      x += kStep;
+      res32_0 =
+          get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x));
+      x += kStep;
+      res32_1 =
+          get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x));
+      svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
+                                      svreinterpret_u16_u32(res32_1));
+      svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0),  // second narrowing step: 16-bit -> 8-bit
+                                   svreinterpret_u8_u16(result1));
+      svst1(svptrue_b8(), p_dst, result);  // unpredicated full-vector store; presumably kStep == 32-bit lane count - TODO confirm
+    });
+    loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); });
+    loop.remaining(  // tail: the callback's `length` is passed through as x_max
+        [&](size_t x, size_t length) { vector_path_generic(x, length, dst); });
+  }
+
+  if constexpr (std::is_same<ScalarType, uint16_t>::value) {  // 16-bit pixels
+    auto vector_path_generic = [&](size_t x, size_t x_max,
+                                   Columns<ScalarType> dst) {
+      size_t length = x_max - x;
+      svbool_t pg32 = svwhilelt_b32(0ULL, length);  // enable only the first `length` lanes
+      svuint32_t result =
+          get_pixels(pg32, calculate_nearest_coordinates(pg32, x));
+      svst1h_u32(pg32, &dst[static_cast<ptrdiff_t>(x)], result);  // truncating store: low halfword of each active lane
+    };
+
+    loop.unroll_twice([&](size_t x) {  // two 32-bit result vectors are packed into one halfword vector
+      ScalarType* p_dst = &dst[static_cast<ptrdiff_t>(x)];
+      svuint32_t res32_0 =
+          get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x));
+      x += kStep;
+      svuint32_t res32_1 =
+          get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x));
+      svuint16_t result = svuzp1_u16(svreinterpret_u16_u32(res32_0),  // even 16-bit elements: low half of each 32-bit lane (little-endian layout assumed)
+                                     svreinterpret_u16_u32(res32_1));
+      svst1(svptrue_b16(), p_dst, result);  // unpredicated full-vector store of the packed halfwords
+    });
+    loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); });
+    loop.remaining(  // tail: the callback's `length` is passed through as x_max
+        [&](size_t x, size_t length) { vector_path_generic(x, length, dst); });
+  }
+}
+
 // TODO reduce functional complexity
 template <typename ScalarType, bool IsLarge,
           kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border>
@@ -835,10 +956,8 @@ void remap32f_process_rows(Rows<const ScalarType> src_rows, size_t src_width,
   svuint32_t sv_xmax = svdup_n_u32(src_width - 1);
   svuint32_t sv_ymax = svdup_n_u32(src_height - 1);
   svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride());
-  svuint32_t sv_border;
-  // sv_border is only used if the border type is constant.
-  // If the border type is not constant then border_value is permitted to be
-  // null and must not be read.
+  svuint32_t sv_border = svdup_n_u32(0);
+
   if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
     sv_border = svdup_n_u32(border_value[0]);
   }
@@ -868,13 +987,13 @@ void remap32f_process_rows(Rows<const ScalarType> src_rows, size_t src_width,
     }
   };
 
-  auto process_row = [&]() {
+  for (size_t y = y_begin; y < y_end; ++y) {
     Columns<ScalarType> dst = dst_rows.as_columns();
     LoopUnroll2 loop{dst_width, kStep};
-    // GCOVR_EXCL_START
     if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
-      assert(!"INTER_NEAREST not implemented for RemapF32");
-      // GCOVR_EXCL_STOP
+      remap32f_nearest<ScalarType, IsLarge, Border>(
+          sv_xmax, sv_ymax, sv_src_stride, src_rows, sv_border, dst, kStep,
+          dst_width, mapx_rows, mapy_rows);
     } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) {
       if constexpr (std::is_same<ScalarType, uint8_t>::value) {
         loop.unroll_four_times([&](size_t x) {
@@ -933,10 +1052,6 @@ void remap32f_process_rows(Rows<const ScalarType> src_rows, size_t src_width,
     }
     ++mapx_rows;
     ++mapy_rows;
-  };
-
-  for (size_t y = y_begin; y < y_end; ++y) {
-    process_row();
     ++dst_rows;
   }
 }
diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt
index a8dc74e71b77a5c8d5341ab932c665242e3430c1..0625b6bd37e0eff38df8fcb15fa7a874b728551e 100755
--- a/scripts/benchmark/benchmarks.txt
+++ b/scripts/benchmark/benchmarks.txt
@@ -86,10 +86,14 @@ Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8U
 Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)'
 Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)'
 Remap_S16Point5_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)'
-Remap_F32_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)'
-Remap_F32_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)'
-Remap_F32_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)'
-Remap_F32_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)'
+Remap_F32_U8_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)'
+Remap_F32_U8_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)'
+Remap_F32_U16_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)'
+Remap_F32_U16_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)'
+Remap_F32_U8_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)'
+Remap_F32_U8_Constant_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_CONSTANT)'
+Remap_F32_U16_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)'
+Remap_F32_U16_Constant_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_NEAREST, BORDER_CONSTANT)'
 
 WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 1)'
 WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 1)'
diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp
index 5966dd191bc76db3f4f91bbc86d937c9783ff2fb..7a18a0bcf113bde7e967345fc3dd263d57ef25d2 100644
--- a/test/api/test_remap.cpp
+++ b/test/api/test_remap.cpp
@@ -863,20 +863,23 @@ class RemapF32 : public testing::Test {
   static void test_random(size_t src_w, size_t src_h, size_t dst_w,
                           size_t dst_h, size_t channels,
                           kleidicv_border_type_t border_type,
-                          const ScalarType *border_value, size_t padding) {
+                          const ScalarType *border_value,
+                          kleidicv_interpolation_type_t interpolation,
+                          size_t padding) {
     test::PseudoRandomNumberGenerator<float> coord_generator;
     test::Array2D<float> mapx(dst_w, dst_h, padding);
     test::Array2D<float> mapy(dst_w, dst_h, padding);
     mapx.fill(coord_generator);
     mapy.fill(coord_generator);
     execute_test(mapx, mapy, src_w, src_h, dst_w, dst_h, channels, border_type,
-                 border_value, padding);
+                 border_value, interpolation, padding);
   }
 
   static void test_outside_random(size_t src_w, size_t src_h, size_t dst_w,
                                   size_t dst_h, size_t channels,
                                   kleidicv_border_type_t border_type,
                                   const ScalarType *border_value,
+                                  kleidicv_interpolation_type_t interpolation,
                                   size_t padding) {
     test::Array2D<float> mapx(dst_w, dst_h, padding);
     test::PseudoRandomNumberGeneratorFloatRange<float> xcoord_generator{
@@ -891,12 +894,14 @@ class RemapF32 : public testing::Test {
         static_cast<float>(2 * src_h)};
     mapy.fill(ycoord_generator);
     execute_test(mapx, mapy, src_w, src_h, dst_w, dst_h, channels, border_type,
-                 border_value, padding);
+                 border_value, interpolation, padding);
   }
 
   static void test_blend(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h,
                          size_t channels, kleidicv_border_type_t border_type,
-                         const ScalarType *border_value, size_t padding) {
+                         const ScalarType *border_value,
+                         kleidicv_interpolation_type_t interpolation,
+                         size_t padding) {
     test::Array2D<float> mapx(dst_w, dst_h, padding);
     test::Array2D<float> mapy(dst_w, dst_h, padding);
     for (size_t row = 0; row < dst_h; ++row) {
@@ -913,7 +918,7 @@ class RemapF32 : public testing::Test {
       }
     }
     execute_test(mapx, mapy, src_w, src_h, dst_w, dst_h, channels, border_type,
-                 border_value, padding);
+                 border_value, interpolation, padding);
   }
 
   // Test coordinates with edge values that may easily overflow
@@ -921,6 +926,7 @@ class RemapF32 : public testing::Test {
                                 size_t dst_h, size_t channels,
                                 kleidicv_border_type_t border_type,
                                 const ScalarType *border_value,
+                                kleidicv_interpolation_type_t interpolation,
                                 size_t padding) {
     test::Array2D<float> mapx(dst_w, dst_h, padding);
     test::Array2D<float> mapy(dst_w, dst_h, padding);
@@ -991,7 +997,8 @@ class RemapF32 : public testing::Test {
     test::PseudoRandomNumberGenerator<ScalarType> generator;
     actual.fill(42);
 
-    calculate_expected(source, mapx, mapy, border_type, border_value, expected);
+    calculate_expected(source, mapx, mapy, border_type, border_value,
+                       interpolation, expected);
 
     ASSERT_EQ(
         KLEIDICV_OK,
@@ -999,7 +1006,7 @@ class RemapF32 : public testing::Test {
             source.data(), source.stride(), source.width(), source.height(),
             actual.data(), actual.stride(), actual.width(), actual.height(),
             channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(),
-            KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value));
+            interpolation, border_type, border_value));
 
     if (expected.compare_to(actual, 1)) {
       if (source.width() < 100 && source.height() < 100) {
@@ -1024,7 +1031,9 @@ class RemapF32 : public testing::Test {
                            test::Array2D<float> &mapy, size_t src_w,
                            size_t src_h, size_t dst_w, size_t dst_h,
                            size_t channels, kleidicv_border_type_t border_type,
-                           const ScalarType *border_value, size_t padding) {
+                           const ScalarType *border_value,
+                           kleidicv_interpolation_type_t interpolation,
+                           size_t padding) {
     size_t src_total_width = channels * src_w;
     size_t dst_total_width = channels * dst_w;
 
@@ -1036,7 +1045,8 @@ class RemapF32 : public testing::Test {
     source.fill(generator);
     actual.fill(42);
 
-    calculate_expected(source, mapx, mapy, border_type, border_value, expected);
+    calculate_expected(source, mapx, mapy, border_type, border_value,
+                       interpolation, expected);
 
     ASSERT_EQ(
         KLEIDICV_OK,
@@ -1044,7 +1054,7 @@ class RemapF32 : public testing::Test {
             source.data(), source.stride(), source.width(), source.height(),
             actual.data(), actual.stride(), actual.width(), actual.height(),
             channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(),
-            KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value));
+            interpolation, border_type, border_value));
 
     if (expected.compare_to(actual, 1)) {
       if (source.width() < 100 && source.height() < 100) {
@@ -1069,6 +1079,7 @@ class RemapF32 : public testing::Test {
                                  test::Array2D<float> &mapy,
                                  kleidicv_border_type_t border_type,
                                  const ScalarType *border_value,
+                                 kleidicv_interpolation_type_t interpolation,
                                  test::Array2D<ScalarType> &expected) {
     auto get_src = [&](ptrdiff_t x, ptrdiff_t y) {
       return get_array2d_element_or_border(src, x, y, border_type,
@@ -1078,9 +1089,10 @@ class RemapF32 : public testing::Test {
     for (size_t row = 0; row < expected.height(); row++) {
       for (size_t column = 0; column < expected.width() / src.channels();
            ++column) {
-        for (size_t ch = 0; ch < src.channels(); ++ch) {
-          float x = *mapx.at(row, column);
-          float y = *mapy.at(row, column);
+        float x = *mapx.at(row, column);
+        float y = *mapy.at(row, column);
+
+        if (interpolation == KLEIDICV_INTERPOLATION_LINEAR) {
           ptrdiff_t ix = static_cast<ptrdiff_t>(std::max<float>(
               INT_MIN,
               std::min<float>(std::floor(x),
@@ -1091,15 +1103,31 @@ class RemapF32 : public testing::Test {
                               static_cast<float>(KLEIDICV_MAX_IMAGE_PIXELS))));
           float xfrac = x - std::floor(x);
           float yfrac = y - std::floor(y);
-          float a = get_src(ix, iy)[ch];
-          float b = get_src(ix + 1, iy)[ch];
-          float c = get_src(ix, iy + 1)[ch];
-          float d = get_src(ix + 1, iy + 1)[ch];
-          float line1 = (b - a) * xfrac + a;
-          float line2 = (d - c) * xfrac + c;
-          float float_result = (line2 - line1) * yfrac + line1;
-          *expected.at(row, column * src.channels() + ch) =
-              static_cast<ScalarType>(std::lround(float_result));
+          for (size_t ch = 0; ch < src.channels(); ++ch) {
+            float a = get_src(ix, iy)[ch];
+            float b = get_src(ix + 1, iy)[ch];
+            float c = get_src(ix, iy + 1)[ch];
+            float d = get_src(ix + 1, iy + 1)[ch];
+            float line1 = (b - a) * xfrac + a;
+            float line2 = (d - c) * xfrac + c;
+            float float_result = (line2 - line1) * yfrac + line1;
+            *expected.at(row, column * src.channels() + ch) =
+                static_cast<ScalarType>(std::lround(float_result));
+          }
+        } else {
+          assert(interpolation == KLEIDICV_INTERPOLATION_NEAREST);
+          ptrdiff_t ix = static_cast<ptrdiff_t>(std::max<float>(
+              INT_MIN,
+              std::min<float>(std::round(x),
+                              static_cast<float>(KLEIDICV_MAX_IMAGE_PIXELS))));
+          ptrdiff_t iy = static_cast<ptrdiff_t>(std::max<float>(
+              INT_MIN,
+              std::min<float>(std::round(y),
+                              static_cast<float>(KLEIDICV_MAX_IMAGE_PIXELS))));
+          for (size_t ch = 0; ch < src.channels(); ++ch) {
+            *expected.at(row, column * src.channels() + ch) =
+                get_src(ix, iy)[ch];
+          }
         }
       }
     }
@@ -1117,8 +1145,12 @@ TYPED_TEST(RemapF32, RandomNoPadding) {
   size_t channels = 1;
   size_t padding = 0;
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
-    TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, border_type,
-                             border_value, padding);
+    for (auto interpolation :
+         {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) {
+      TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels,
+                               border_type, border_value, interpolation,
+                               padding);
+    }
   }
 }
 
@@ -1130,8 +1162,11 @@ TYPED_TEST(RemapF32, BlendPadding) {
   size_t channels = 1;
   size_t padding = 13;
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
-    TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type,
-                            border_value, padding);
+    for (auto interpolation :
+         {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) {
+      TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type,
+                              border_value, interpolation, padding);
+    }
   }
 }
 
@@ -1143,8 +1178,12 @@ TYPED_TEST(RemapF32, OutsideRandomPadding) {
   size_t channels = 1;
   size_t padding = 13;
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
-    TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels,
-                                     border_type, border_value, padding);
+    for (auto interpolation :
+         {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) {
+      TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels,
+                                       border_type, border_value, interpolation,
+                                       padding);
+    }
   }
 }
 
@@ -1156,8 +1195,11 @@ TYPED_TEST(RemapF32, BlendBigStride) {
   size_t channels = 1;
   size_t padding = 1 << 16;
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
-    TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type,
-                            border_value, padding);
+    for (auto interpolation :
+         {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) {
+      TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type,
+                              border_value, interpolation, padding);
+    }
   }
 }
 
@@ -1169,8 +1211,12 @@ TYPED_TEST(RemapF32, CornerCases) {
   size_t channels = 1;
   size_t padding = 17;
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
-    TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels,
-                                   border_type, border_value, padding);
+    for (auto interpolation :
+         {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) {
+      TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels,
+                                     border_type, border_value, interpolation,
+                                     padding);
+    }
   }
 }
 
@@ -1184,8 +1230,12 @@ TYPED_TEST(RemapF32, CornerCasesLargeLoad) {
   size_t channels = 1;
   size_t padding = 1;
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
-    TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels,
-                                   border_type, border_value, padding);
+    for (auto interpolation :
+         {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) {
+      TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels,
+                                     border_type, border_value, interpolation,
+                                     padding);
+    }
   }
 }
 
@@ -1224,6 +1274,7 @@ TYPED_TEST(RemapF32, ZeroHeightImage) {
   const size_t mapx_stride = kW * sizeof(float);
   const size_t mapy_stride = kW * sizeof(float);
 
+  // TODO: Why these sets of parameters?
   for (auto [border_type, border_value] : get_borders<TypeParam>()) {
     EXPECT_EQ(KLEIDICV_OK,
               remap_f32<TypeParam>()(src, src_stride, kW, 1, dst, dst_stride,
@@ -1251,6 +1302,7 @@ TYPED_TEST(RemapF32, InvalidImageSize) {
   float mapx[1] = {};
   float mapy[1] = {};
 
+  // TODO: Why these sets of parameters?
   EXPECT_EQ(
       KLEIDICV_ERROR_RANGE,
       remap_f32<TypeParam>()(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS + 1,
@@ -1297,21 +1349,6 @@ TYPED_TEST(RemapF32, UnsupportedTwoChannels) {
                              KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
 }
 
-TYPED_TEST(RemapF32, UnsupportedInterpolationTypeNEAREST) {
-  const size_t element_size = sizeof(TypeParam);
-  const TypeParam src[1] = {};
-  TypeParam dst[16];
-  float mapx[16] = {};
-  float mapy[16] = {};
-
-  EXPECT_EQ(
-      KLEIDICV_ERROR_NOT_IMPLEMENTED,
-      remap_f32<TypeParam>()(src, element_size, 1, 1, dst, 16 * element_size,
-                             16, 1, 1, mapx, 16 * sizeof(float), mapy,
-                             16 * sizeof(float), KLEIDICV_INTERPOLATION_NEAREST,
-                             KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
-}
-
 TYPED_TEST(RemapF32, UnsupportedTooSmallImage) {
   const size_t element_size = sizeof(TypeParam);
   const TypeParam src[1] = {};
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
index c1a53f6daeb1b9d815bbea307f99049a34a8870a..128cf7af7209275f708135dd0f1d263008231654 100644
--- a/test/api/test_thread.cpp
+++ b/test/api/test_thread.cpp
@@ -780,9 +780,6 @@ TEST_P(Thread, remap_f32_u8_not_implemented) {
   check_remap_f32_not_implemented<uint8_t>(
       kleidicv_thread_remap_f32_u8, 2, KLEIDICV_INTERPOLATION_LINEAR,
       KLEIDICV_BORDER_TYPE_REPLICATE, &border_value);
-  check_remap_f32_not_implemented<uint8_t>(
-      kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_NEAREST,
-      KLEIDICV_BORDER_TYPE_REPLICATE, &border_value);
   check_remap_f32_not_implemented<uint8_t>(
       kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_LINEAR,
       KLEIDICV_BORDER_TYPE_REFLECT, &border_value);
@@ -807,9 +804,6 @@ TEST_P(Thread, remap_f32_u16_not_implemented) {
   check_remap_f32_not_implemented<uint16_t>(
       kleidicv_thread_remap_f32_u16, 2, KLEIDICV_INTERPOLATION_LINEAR,
       KLEIDICV_BORDER_TYPE_REPLICATE, &border_value);
-  check_remap_f32_not_implemented<uint16_t>(
-      kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_NEAREST,
-      KLEIDICV_BORDER_TYPE_REPLICATE, &border_value);
   check_remap_f32_not_implemented<uint16_t>(
       kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_LINEAR,
       KLEIDICV_BORDER_TYPE_REFLECT, &border_value);