diff --git a/CHANGELOG.md b/CHANGELOG.md
index 64273414c16d7aaf3f1bc60e45c65cdb288f5e10..d55cb571f2a9eae8e7efbfa927efc93595b1be29 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,8 +18,10 @@ This changelog aims to follow the guiding principles of
 ### Added
 - Implementation of Rotate 90 degrees clockwise.
 - Remap implementations with
+  - Integer coordinates with nearest neighbour method
+  - Fixed-point coordinates with linear interpolation
+  - Floating-point coordinates with linear interpolation
   - Replicated and constant borders
-  - Nearest neighbour and fixed-point interpolations
   - 1-channel only
   - u8 and u16 inputs
 - WarpPerspective implementation
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index addbd7281f88ffad983423049f78525fe90f6c20..c5b82b23ba08247482a0768d10bbc6490401dbc7 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -1377,6 +1377,38 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step,
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
 }
 
+int remap_f32(int src_type, const uchar *src_data, size_t src_step,
+              int src_width, int src_height, uchar *dst_data, size_t dst_step,
+              int dst_width, int dst_height, float *mapx, size_t mapx_step,
+              float *mapy, size_t mapy_step, int interpolation, int border_type,
+              const double border_value_f64[4]) {
+  kleidicv_border_type_t kleidicv_border_type;
+  if (from_opencv(border_type, kleidicv_border_type)) {
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+  }
+
+  kleidicv_interpolation_type_t kleidicv_interpolation_type;
+  if (from_opencv(interpolation, kleidicv_interpolation_type)) {
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+  }
+
+  auto border_value = get_border_value<uint8_t>(border_value_f64);
+
+  auto mt = get_multithreading();
+
+  // Only implement CV_8UC1 so far
+  if (src_type == CV_8UC1) {
+    return convert_error(kleidicv_thread_remap_f32_u8(
+        src_data, src_step, static_cast<size_t>(src_width),
+        static_cast<size_t>(src_height), dst_data, dst_step,
+        static_cast<size_t>(dst_width), static_cast<size_t>(dst_height), 1,
+        mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type,
+        kleidicv_border_type, border_value.data(), mt));
+  }
+
+  return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+
 int pyrdown(const uchar *src_data, size_t src_step, int src_width,
             int src_height, uchar *dst_data, size_t dst_step, int dst_width,
             int dst_height, int depth, int cn, int border_type) {
diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index 2a5f04e31184457085174bcaf6af04c701ec3dd5..dff50167646408867c4f1c4381a18f4a6a7507f3 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -158,6 +158,12 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step,
                     const uint16_t *mapfrac, size_t mapfrac_step,
                     int border_type, const double border_value[4]);
 
+int remap_f32(int src_type, const uchar *src_data, size_t src_step,
+              int src_width, int src_height, uchar *dst_data, size_t dst_step,
+              int dst_width, int dst_height, float *mapx, size_t mapx_step,
+              float *mapy, size_t mapy_step, int interpolation, int border_type,
+              const double border_value[4]);
+
 int warp_perspective(int src_type, const uchar *src_data, size_t src_step,
                      int src_width, int src_height, uchar *dst_data,
                      size_t dst_step, int dst_width, int dst_height,
@@ -418,6 +424,20 @@ static inline int kleidicv_remap_s16point5_with_fallback(
 #define cv_hal_remap16s16u kleidicv_remap_s16point5_with_fallback
 #endif  // cv_hal_remap16s16u
 
+static inline int kleidicv_remap_f32_with_fallback(
+    int src_type, const uchar *src_data, size_t src_step, int src_width,
+    int src_height, uchar *dst_data, size_t dst_step, int dst_width,
+    int dst_height, float *mapx, size_t mapx_step, float *mapy,
+    size_t mapy_step, int interpolation, int border_type,
+    const double border_value[4]) {
+  return KLEIDICV_HAL_FALLBACK_FORWARD(
+      remap_f32, cv_hal_remap32f, src_type, src_data, src_step, src_width,
+      src_height, dst_data, dst_step, dst_width, dst_height, mapx, mapx_step,
+      mapy, mapy_step, interpolation, border_type, border_value);
+}
+#undef cv_hal_remap32f
+#define cv_hal_remap32f kleidicv_remap_f32_with_fallback
+
 // pyrdown
 static inline int kleidicv_pyrdown_with_fallback(
     const uchar *src_data, size_t src_step, int src_width, int src_height,
diff --git a/adapters/opencv/opencv-4.11.patch b/adapters/opencv/opencv-4.11.patch
index 30962741c0d7da2feea1cc40ca6f3d5188dabb65..2029c3026cf5b340b71969a86cad6fdcca7c3d5f 100644
--- a/adapters/opencv/opencv-4.11.patch
+++ b/adapters/opencv/opencv-4.11.patch
@@ -19,7 +19,7 @@ index 2b4035285f..729cd1dd43 100644
 @@ -281,6 +281,11 @@ void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
      dst.create(dims, size, dtype);
      Mat dstMat = dst.getMat();
- 
+
 +    if( dims <= 2 ) {
 +        int width_in_elements = src.cols * cn;
 +        CALL_HAL(convertTo, cv_hal_convertTo, src.data, src.step, src.depth(), dstMat.data, dstMat.step, dstMat.depth(), width_in_elements, src.rows, alpha, beta);
@@ -35,7 +35,7 @@ index 474fe17393..5d5289cc16 100644
 @@ -1011,6 +1011,53 @@ inline int hal_ni_transpose2d(const uchar* src_data, size_t src_step, uchar* dst
  #define cv_hal_transpose2d hal_ni_transpose2d
  //! @endcond
- 
+
 +/**
 +   @brief sum
 +   @param src_data,src_step,src_type Source image
@@ -84,15 +84,15 @@ index 474fe17393..5d5289cc16 100644
 +//! @endcond
 +
  //! @}
- 
- 
+
+
 diff --git a/modules/core/src/sum.dispatch.cpp b/modules/core/src/sum.dispatch.cpp
 index fade948336..17b40ca0e8 100644
 --- a/modules/core/src/sum.dispatch.cpp
 +++ b/modules/core/src/sum.dispatch.cpp
 @@ -199,6 +199,10 @@ Scalar sum(InputArray _src)
      CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_sum(src, _res), _res);
- 
+
      int k, cn = src.channels(), depth = src.depth();
 +
 +    double result = 0;
@@ -100,7 +100,7 @@ index fade948336..17b40ca0e8 100644
 +
      SumFunc func = getSumFunc(depth);
      CV_Assert( cn <= 4 && func != 0 );
- 
+
 diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp
 index fe6019e3a7..b2d8c8b533 100644
 --- a/modules/imgproc/src/hal_replacement.hpp
@@ -108,7 +108,7 @@ index fe6019e3a7..b2d8c8b533 100644
 @@ -378,6 +378,60 @@ inline int hal_ni_remap32f(int src_type, const uchar *src_data, size_t src_step,
  #define cv_hal_remap32f hal_ni_remap32f
  //! @endcond
- 
+
 +/**
 +   @brief hal_remap with a short integer map
 +   @param src_type source and destination image type
@@ -183,8 +183,38 @@ index dfc718bf87..c1f953f230 100644
 +        CALL_HAL(remap16s16u, cv_hal_remap16s16u, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
 +                 map1.ptr<short>(), map1.step, map2.ptr<unsigned short>(), map2.step, borderType, borderValue.val);
      }
- 
+
      interpolation &= ~WARP_RELATIVE_MAP;
+diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp
+index f7dafbd956..13c1341716 100644
+--- a/modules/imgproc/src/smooth.dispatch.cpp
++++ b/modules/imgproc/src/smooth.dispatch.cpp
+@@ -655,6 +655,25 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
+             ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)
+     );
+
++    {
++        Mat src = _src.getMat();
++        Mat dst = _dst.getMat();
++
++        Point ofs;
++        Size wsz(src.cols, src.rows);
++        if(!(borderType & BORDER_ISOLATED))
++            src.locateROI( wsz, ofs );
++
++        if (sigma1 == 0.0 && sigma2 == 0.0 && ksize.height == ksize.width) {
++            CALL_HAL(gaussianBlurBinomial, cv_hal_gaussianBlurBinomial, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
++                    ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, borderType&~BORDER_ISOLATED);
++        }
++
++        CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
++                ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
++                sigma1, sigma2, borderType&~BORDER_ISOLATED);
++    }
++
+     if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.isSubmatrix()))
+     {
+         std::vector<ufixedpoint16> fkx, fky;
 diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp
 index 673c6f03e6..56d9e0b554 100644
 --- a/modules/imgproc/test/test_imgwarp_strict.cpp
@@ -196,5 +226,22 @@ index 673c6f03e6..56d9e0b554 100644
 -        return 1.0f;
 +        return 4.0f;    // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test
  }
- 
+
  void CV_ImageWarpBaseTest::validate_results() const
+diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp
+index e8840d231b..adfaad38b0 100644
+--- a/modules/imgproc/test/test_imgwarp.cpp
++++ b/modules/imgproc/test/test_imgwarp.cpp
+@@ -1371,7 +1371,11 @@ TEST_P(Imgproc_RemapRelative, validity)
+         cv::remap(src, dstRelative, mapRelativeX32F, mapRelativeY32F, interpolation | WARP_RELATIVE_MAP, borderType);
+     }
+
+-    EXPECT_EQ(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 0);
++    if (interpolation != INTER_LINEAR) {
++        EXPECT_EQ(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 0);
++    } else {  // TODO: Check whether 4 is acceptable? f32_u8_linear_constant can be 4
++        EXPECT_LT(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 5); // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test
++    }
+ };
+
+ INSTANTIATE_TEST_CASE_P(ImgProc, Imgproc_RemapRelative, testing::Combine(
diff --git a/adapters/opencv/opencv-5.x.patch b/adapters/opencv/opencv-5.x.patch
index 3f0298d27d4acd506e66a2c03e2e3c3fb218216d..bf7ce73b983c604397150284c24d1d973acb3f6a 100644
--- a/adapters/opencv/opencv-5.x.patch
+++ b/adapters/opencv/opencv-5.x.patch
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -170,3 +170,20 @@ index 83a5e23781..75df962fb7 100644
 -        return 1.0f;
 +        return 4.0f;    // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test
  }
+diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp
+index aded7bb74e..adf6a67e2b 100644
+--- a/modules/imgproc/test/test_imgwarp.cpp
++++ b/modules/imgproc/test/test_imgwarp.cpp
+@@ -811,7 +811,11 @@ TEST_P(Imgproc_RemapRelative, validity)
+         cv::remap(src, dstRelative, mapRelativeX32F, mapRelativeY32F, interpolation | WARP_RELATIVE_MAP, borderType);
+     }
+
+-    EXPECT_EQ(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 0);
++    if (interpolation != INTER_LINEAR) {
++        EXPECT_EQ(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 0);
++    } else {
++        EXPECT_LT(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 4); // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test
++    }
+ };
+
+ INSTANTIATE_TEST_CASE_P(ImgProc, Imgproc_RemapRelative, testing::Combine(
diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp
index 6034acc286658fa0ebfcdc202270940229d17328..89d6b0ebab5a1cedc906affa336d103bc58bfdda 100644
--- a/conformity/opencv/test_remap.cpp
+++ b/conformity/opencv/test_remap.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -18,8 +18,14 @@ static cv::Mat get_source_mat(int format) {
     cv::Mat m(kMaxHeight, kMaxWidth, format);
     for (size_t row = 0; row < kMaxHeight; ++row) {
       for (size_t column = 0; column < kMaxWidth; ++column) {
-        m.at<ScalarType>(row, column) =
-            (row * kMaxWidth + column) % std::numeric_limits<ScalarType>::max();
+        // Referring to the conformity check in test_warp_perspective
+        // Remap calculation in float greatly amplifies any small errors
+        // coming from precision innaccuracies, but ensuring that neighbouring
+        // pixels have neighbouring values decreases this effect, so it can be
+        // expected that the error won't be bigger than 1.
+        const int kMaxValue = std::numeric_limits<ScalarType>::max();
+        m.at<ScalarType>(row, column) = abs(
+            static_cast<int>(row + column) % (2 * kMaxValue + 1) - kMaxValue);
       }
     }
     return m;
@@ -79,8 +85,7 @@ template <class ScalarType, int Format, int Interpolation, int BorderMode,
           int BorderValue>
 cv::Mat exec_remap_s16point5(cv::Mat& map_mat) {
   cv::Mat empty;
-  // integer part is 16SC2, that is twice as much data as the fractional part,
-  // 16UC1
+  // integer part is 16SC2, twice as much data as the fractional part, 16UC1
   int height = map_mat.rows * 2 / 3;
   cv::Mat mapxy_mat = map_mat.rowRange(0, height);
   ushort* p_frac = map_mat.rowRange(height, map_mat.rows).ptr<ushort>();
@@ -131,6 +136,64 @@ bool test_remap_s16point5(int index, RecreatedMessageQueue& request_queue,
 }
 #endif
 
+// BorderValue is interpreted as 1/1000, i.e. 500 for 0.5
+template <class ScalarType, int Format, int Interpolation, int BorderMode,
+          int BorderValue>
+cv::Mat exec_remap_f32(cv::Mat& mapxy_mat) {
+  cv::Mat source_mat = get_source_mat<ScalarType>(Format);
+  cv::Mat result(mapxy_mat.rows, mapxy_mat.cols, Format);
+
+  cv::Mat mapx_mat = mapxy_mat.rowRange(0, mapxy_mat.rows / 2);
+  cv::Mat mapy_mat = mapxy_mat.rowRange(mapxy_mat.rows / 2, mapxy_mat.rows);
+
+  remap(source_mat, result, mapx_mat, mapy_mat, Interpolation, BorderMode,
+        BorderValue / 1000.0);
+  return result;
+}
+
+#if MANAGER
+template <class ScalarType, int Format, int Interpolation, int BorderMode,
+          int BorderValue>
+bool test_remap_f32(int index, RecreatedMessageQueue& request_queue,
+                    RecreatedMessageQueue& reply_queue) {
+  cv::Mat source_mat = get_source_mat<ScalarType>(Format);
+  cv::RNG rng(0);
+
+  for (size_t w = 5; w <= kMaxWidth; w += 3) {
+    for (size_t h = 5; h <= kMaxHeight; h += 2) {
+      cv::Mat map_mat(h * 2, w, CV_32FC1);
+      cv::Mat mapx_mat = map_mat.rowRange(0, h);
+      rng.fill(mapx_mat, cv::RNG::UNIFORM, -3, kMaxWidth + 3);
+
+      cv::Mat mapy_mat = map_mat.rowRange(h, map_mat.rows);
+      rng.fill(mapy_mat, cv::RNG::UNIFORM, -3, kMaxHeight + 3);
+
+      cv::Mat actual_mat = exec_remap_f32<ScalarType, Format, Interpolation,
+                                          BorderMode, BorderValue>(map_mat);
+
+      cv::Mat expected_mat = get_expected_from_subordinate(
+          index, request_queue, reply_queue, map_mat);
+
+      bool success =
+          (CV_MAT_DEPTH(Format) == CV_8U &&
+           !are_matrices_different<uint8_t>(2, actual_mat, expected_mat)) ||
+          (CV_MAT_DEPTH(Format) == CV_16U &&
+           !are_matrices_different<uint16_t>(2, actual_mat, expected_mat));
+      if (!success) {
+        fail_print_matrices(w, h, source_mat, actual_mat, expected_mat);
+        std::cout << "=== mapx_mat:" << std::endl;
+        std::cout << mapx_mat << std::endl << std::endl;
+        std::cout << "=== mapy_mat:" << std::endl;
+        std::cout << mapy_mat << std::endl << std::endl;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+#endif
+
 std::vector<test>& remap_tests_get() {
   // clang-format off
   static std::vector<test> tests = {
@@ -138,11 +201,13 @@ std::vector<test>& remap_tests_get() {
     TEST("RemapS16 uint16 Replicate", (test_remap_s16<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>), (exec_remap_s16<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>)),
     TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
     TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
+    TEST("RemapF32 uint8 Replicate", (test_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
 
-    TEST("RemapS16 uint8 Constant", (test_remap_s16<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 321>), (exec_remap_s16<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 321>)),
-    TEST("RemapS16 uint16 Constant", (test_remap_s16<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 321>), (exec_remap_s16<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 321>)),
-    TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 321>), (exec_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 321>)),
-    TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 321>), (exec_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 321>)),
+    TEST("RemapS16 uint8 Constant", (test_remap_s16<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 12321>), (exec_remap_s16<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 12321>)),
+    TEST("RemapS16 uint16 Constant", (test_remap_s16<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 12321>), (exec_remap_s16<uint16_t, CV_16UC1, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 12321>)),
+    TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>), (exec_remap_s16point5<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>)),
+    TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>), (exec_remap_s16point5<uint16_t, CV_16UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 12321>)),
+    TEST("RemapF32 uint8 Constant", (test_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 123210>), (exec_remap_f32<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 123210>)),
   };
   // clang-format on
   return tests;
diff --git a/conformity/opencv/utils.h b/conformity/opencv/utils.h
index ef5ae65fcb6ffff7ec0cf647ee50b46938de63a8..bf19e7b13f8e773575e073a7280bb72a9bd27fab 100644
--- a/conformity/opencv/utils.h
+++ b/conformity/opencv/utils.h
@@ -72,7 +72,9 @@ bool are_matrices_different(T threshold, cv::Mat& A, cv::Mat& B) {
   for (int i = 0; i < A.rows; ++i) {
     for (int j = 0; j < (A.cols * CV_MAT_CN(A.type())); ++j) {
       if (abs_diff<T>(A.at<T>(i, j), B.at<T>(i, j)) > threshold) {
-        std::cout << "=== Mismatch at: " << i << " " << j << std::endl
+        std::cout << "=== Mismatch at [" << i << ", " << j
+                  << "]: " << +A.at<T>(i, j) << " vs. " << +B.at<T>(i, j)
+                  << std::endl
                   << std::endl;
         return true;
       }
diff --git a/doc/functionality.md b/doc/functionality.md
index 2f2e2eed54dc3b65ee3674d10b5f1a652265a3b4..ab124b7ccd899e8ad037edc5e2391206cf8a94b7 100644
--- a/doc/functionality.md
+++ b/doc/functionality.md
@@ -1,5 +1,5 @@
 <!--
-SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 
 SPDX-License-Identifier: Apache-2.0
 -->
@@ -97,6 +97,7 @@ See `doc/opencv.md` for details of the functionality available in OpenCV.
 |--------------------------------------------|-----|-----|
 | Remap int16 coordinates                    |  x  |  x  |
 | Remap int16+uint16 fixed-point coordinates |  x  |  x  |
+| Remap float32 coordinates                  |  x  |     |
 
 # WarpPerspective
 |                      |  u8 |
diff --git a/doc/opencv.md b/doc/opencv.md
index b9db1d3050df02546b4d7c1f7923d5ae7f8c99b2..2c88467c32eeed251731dcf920aeeea2aa2b1979 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -1,5 +1,5 @@
 <!--
-SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 
 SPDX-License-Identifier: Apache-2.0
 -->
@@ -220,6 +220,10 @@ Supported map configurations:
 * `map1.type()` is `CV_16SC2` and `map2.type()` is `CV_16UC1` - fixed-point representation as generated by [`cv::convertMaps`](https://docs.opencv.org/4.11.0/da/d54/group__imgproc__transform.html#ga9156732fa8f01be9ebd1a194f2728b7f):
   * > ⚠️ **Acceleration will not work unless OpenCV is built from source patched with `opencv-4.11.patch`**
   * supported `interpolation`: `INTER_LINEAR` only
+* `map1` is 32FC1 and `map2` is 32FC1:
+  * `map1` is x coordinates (column)
+  * `map2` is y coordinates (row)
+  * supported `interpolation`: `INTER_LINEAR` only
 
 ### [`cv::warpPerspective()`](https://docs.opencv.org/4.10.0/da/d54/group__imgproc__transform.html#gaf73673a7e8e18ec6963e3774e6a94b87)
 Performs a perspective transformation on an image.
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index 9c6cdfa1d1b6c34be0c43cad436c1585be73901f..29c088ab021c867bd74f62bb8bc741cc5e1b4112 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -1834,6 +1834,60 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src,
                          const uint16_t *border_value);
 #endif  // DOXYGEN
 
+/// Transforms the `src` image by taking the pixels specified by the coordinates
+/// from the `mapxy` image.
+///
+/// Width and height are the same for `mapx`, `mapy` and for `dst`. `src`
+/// dimensions may be different, but due to the limits of 32-bit float format,
+/// its width and height must be less than 2^24. Coordinates outside of `src`
+/// dimensions are considered border.
+///
+/// @param src            Pointer to the source data. Must be non-null.
+/// @param src_stride     Distance in bytes from the start of one row to the
+///                       start of the next row for the source data. Must
+///                       not be less than `width * sizeof(type)`, except for
+///                       single-row images. Must be less than 2^32.
+/// @param src_width      Number of elements in the source row. Must be less
+///                       than 2^24.
+/// @param src_height     Number of rows in the source data. Must be less than
+///                       2^24.
+/// @param dst            Pointer to the destination data. Must be non-null.
+/// @param dst_stride     Distance in bytes from the start of one row to the
+///                       start of the next row for the destination data.
+///                       Must be a multiple of `sizeof(type)` and no less than
+///                       `width * sizeof(type)`, except for single-row images.
+/// @param dst_width      Number of elements in a destination row. Must be at
+///                       least 4.
+/// @param dst_height     Number of rows in the destination data.
+/// @param channels       Number of channels in the (source and destination)
+/// data. Must be 1.
+/// @param mapx           Pointer to the x coordinates' data. Must be non-null.
+/// @param mapx_stride    Distance in bytes from the start of one row to the
+///                       start of the next row for `mapx`. Must be a multiple
+///                       of `sizeof(float)` and no less than `width *
+///                       sizeof(float)`, except for single-row images.
+/// @param mapy           Pointer to the y coordinates' data. Must be non-null.
+/// @param mapy_stride    Distance in bytes from the start of one row to the
+///                       start of the next row for `mapy`. Must be a multiple
+///                       of `sizeof(float)` and no less than `width *
+///                       sizeof(float)`, except for single-row images.
+/// @param interpolation  Interpolation algorithm. Supported types: \n
+///                         - @ref KLEIDICV_INTERPOLATION_LINEAR
+/// @param border_type    Way of handling the border. The supported border types
+///                       are: \n
+///                         - @ref KLEIDICV_BORDER_TYPE_REPLICATE
+///                         - @ref KLEIDICV_BORDER_TYPE_CONSTANT
+/// @param border_value   Border values if the border_type is
+///                       @ref KLEIDICV_BORDER_TYPE_CONSTANT.
+KLEIDICV_API_DECLARATION(kleidicv_remap_f32_u8, const uint8_t *src,
+                         size_t src_stride, size_t src_width, size_t src_height,
+                         uint8_t *dst, size_t dst_stride, size_t dst_width,
+                         size_t dst_height, size_t channels, float *mapx,
+                         size_t mapx_stride, float *mapy, size_t mapy_stride,
+                         kleidicv_interpolation_type_t interpolation,
+                         kleidicv_border_type_t border_type,
+                         const uint8_t *border_value);
+
 #ifndef DOXYGEN
 /// Internal - not part of the public API and its direct use is not supported.
 ///
diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h
index 78dd1e8c58dec60d0f59d88de9e097b9178338e9..6dc74385993171844cabb5241fc27055e7f45b26 100644
--- a/kleidicv/include/kleidicv/transform/remap.h
+++ b/kleidicv/include/kleidicv/transform/remap.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -50,6 +50,26 @@ inline bool remap_s16point5_is_implemented(
   }
 }
 
+template <typename T>
+inline bool remap_f32_is_implemented(
+    size_t src_stride, size_t src_width, size_t src_height, size_t dst_width,
+    kleidicv_border_type_t border_type, size_t channels,
+    kleidicv_interpolation_type_t interpolation) KLEIDICV_STREAMING_COMPATIBLE {
+  if constexpr (std::is_same<T, uint8_t>::value) {
+    return (
+        src_stride <= std::numeric_limits<uint32_t>::max() && dst_width >= 4 &&
+        src_width <=
+            static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) + 1 &&
+        src_height <=
+            static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) + 1 &&
+        (border_type == KLEIDICV_BORDER_TYPE_REPLICATE ||
+         border_type == KLEIDICV_BORDER_TYPE_CONSTANT) &&
+        channels == 1 && interpolation == KLEIDICV_INTERPOLATION_LINEAR);
+  } else {
+    return false;
+  }
+}
+
 // Constants for Remap16Point5
 static const uint16_t REMAP16POINT5_FRAC_BITS = 5;
 static const uint16_t REMAP16POINT5_FRAC_MAX = 1 << REMAP16POINT5_FRAC_BITS;
@@ -76,6 +96,16 @@ kleidicv_error_t remap_s16point5(const T *src, size_t src_stride,
                                  kleidicv_border_type_t border_type,
                                  const T *border_value);
 
+template <typename T>
+kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width,
+                           size_t src_height, T *dst, size_t dst_stride,
+                           size_t dst_width, size_t dst_height, size_t channels,
+                           float *mapx, size_t mapx_stride, float *mapy,
+                           size_t mapy_stride,
+                           kleidicv_interpolation_type_t interpolation,
+                           kleidicv_border_type_t border_type,
+                           const T *border_value);
+
 }  // namespace neon
 
 namespace sve2 {
@@ -98,6 +128,15 @@ kleidicv_error_t remap_s16point5(const T *src, size_t src_stride,
                                  kleidicv_border_type_t border_type,
                                  const T *border_value);
 
+template <typename T>
+kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width,
+                           size_t src_height, T *dst, size_t dst_stride,
+                           size_t dst_width, size_t dst_height, size_t channels,
+                           float *mapx, size_t mapx_stride, float *mapy,
+                           size_t mapy_stride,
+                           kleidicv_interpolation_type_t interpolation,
+                           kleidicv_border_type_t border_type,
+                           const T *border_value);
 }  // namespace sve2
 
 namespace sme2 {
diff --git a/kleidicv/src/transform/common_sc.h b/kleidicv/src/transform/common_sc.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7a7eb01825dbc8bd578cc786f6ceba5a28c15a3
--- /dev/null
+++ b/kleidicv/src/transform/common_sc.h
@@ -0,0 +1,202 @@
+// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arm_sve.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+
+#include "kleidicv/ctypes.h"
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/sve2.h"
+#include "kleidicv/traits.h"
+#include "kleidicv/types.h"
+
+namespace KLEIDICV_TARGET_NAMESPACE {
+
+// Convert border_type to a template argument.
+template <typename ScalarType, bool IsLarge,
+          kleidicv_interpolation_type_t Inter, typename... Args>
+void remap32f_process_rows(kleidicv_border_type_t border_type, Args &&...args) {
+  if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) {
+    remap32f_process_rows<ScalarType, IsLarge, Inter,
+                          KLEIDICV_BORDER_TYPE_REPLICATE>(
+        std::forward<Args>(args)...);
+  } else {
+    remap32f_process_rows<ScalarType, IsLarge, Inter,
+                          KLEIDICV_BORDER_TYPE_CONSTANT>(
+        std::forward<Args>(args)...);
+  }
+}
+
+// Convert interpolation_type to a template argument.
+template <typename ScalarType, bool IsLarge, typename... Args>
+void remap32f_process_rows(kleidicv_interpolation_type_t interpolation_type,
+                           Args &&...args) {
+  if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) {
+    remap32f_process_rows<ScalarType, IsLarge, KLEIDICV_INTERPOLATION_NEAREST>(
+        std::forward<Args>(args)...);
+  } else {
+    remap32f_process_rows<ScalarType, IsLarge, KLEIDICV_INTERPOLATION_LINEAR>(
+        std::forward<Args>(args)...);
+  }
+}
+
+template <typename T>
+bool remap_image_is_large(const Rows<T> &rows, size_t height) {
+  return rows.stride() * height >= 1ULL << 32;
+}
+
+// Convert is_large to a template argument.
+template <typename ScalarType, typename... Args>
+void remap32f_process_rows(bool is_large, Args &&...args) {
+  if (KLEIDICV_UNLIKELY(is_large)) {
+    remap32f_process_rows<ScalarType, true>(std::forward<Args>(args)...);
+  } else {
+    remap32f_process_rows<ScalarType, false>(std::forward<Args>(args)...);
+  }
+}
+
+template <typename ScalarType, bool IsLarge>
+svuint32_t inline load_common(svbool_t pg, svuint32_t x, svuint32_t y,
+                              svuint32_t sv_src_stride,
+                              Rows<const ScalarType> &src_rows) {
+  if constexpr (IsLarge) {
+    svbool_t pg_b = pg;
+    svbool_t pg_t = svtrn2_b32(pg, svpfalse());
+
+    // Calculate offsets from coordinates (y * stride + x)
+    // To avoid losing precision, the final offsets should be in 64 bits
+    svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride);
+    svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride);
+    // Copy pixels from source
+    svuint64_t result_b =
+        svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
+    svuint64_t result_t =
+        svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
+    return svtrn1_u32(svreinterpret_u32_u64(result_b),
+                      svreinterpret_u32_u64(result_t));
+  } else {
+    svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
+    return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets);
+  }
+}
+
+template <typename ScalarType, bool IsLarge>
+svuint32_t inline calculate_linear_replicate(svbool_t pg, svfloat32x2_t coords,
+                                             svfloat32_t xmaxf,
+                                             svfloat32_t ymaxf,
+                                             svuint32_t sv_src_stride,
+                                             Rows<const ScalarType> &src_rows) {
+  auto load_source = [&](svuint32_t x, svuint32_t y) {
+    return load_common<ScalarType, IsLarge>(pg, x, y, sv_src_stride, src_rows);
+  };
+  svbool_t pg_all32 = svptrue_b32();
+  svfloat32_t xf = svget2(coords, 0);
+  svfloat32_t yf = svget2(coords, 1);
+  // Take the integer part, clamp it to within the dimensions of the
+  // source image (negative values are already saturated to 0)
+  svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf));
+  svuint32_t y0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, yf, ymaxf));
+
+  // Get fractional part, or 0 if out of range
+  svbool_t x_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, xf, 0.F),
+                                svcmplt_f32(pg_all32, xf, xmaxf));
+  svbool_t y_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, yf, 0.F),
+                                svcmplt_f32(pg_all32, yf, ymaxf));
+  svfloat32_t xfrac =
+      svsel_f32(x_in_range, svsub_f32_x(pg_all32, xf, svrintm_x(pg_all32, xf)),
+                svdup_n_f32(0.F));
+  svfloat32_t yfrac =
+      svsel_f32(y_in_range, svsub_f32_x(pg_all32, yf, svrintm_x(pg_all32, yf)),
+                svdup_n_f32(0.F));
+
+  // x1 = x0 + 1, except if it's already xmax or out of range
+  svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0);
+  svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0);
+
+  // Calculate offsets from coordinates (y * stride + x)
+  // a: top left, b: top right, c: bottom left, d: bottom right
+  svfloat32_t a = svcvt_f32_u32_x(pg_all32, load_source(x0, y0));
+  svfloat32_t b = svcvt_f32_u32_x(pg_all32, load_source(x1, y0));
+  svfloat32_t line0 =
+      svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
+  svfloat32_t c = svcvt_f32_u32_x(pg_all32, load_source(x0, y1));
+  svfloat32_t d = svcvt_f32_u32_x(pg_all32, load_source(x1, y1));
+  svfloat32_t line1 =
+      svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
+  svfloat32_t result =
+      svmla_f32_x(pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
+  return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F));
+}
+
+template <typename ScalarType, bool IsLarge>
+svuint32_t get_pixels_or_border(svbool_t pg, svuint32_t x, svuint32_t y,
+                                svuint32_t sv_border, svuint32_t sv_xmax,
+                                svuint32_t sv_ymax, svuint32_t sv_src_stride,
+                                Rows<const ScalarType> &src_rows) {
+  svbool_t in_range =
+      svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
+  svuint32_t result =
+      load_common<ScalarType, IsLarge>(in_range, x, y, sv_src_stride, src_rows);
+  // Select between source pixels and border colour
+  return svsel_u32(in_range, result, sv_border);
+}
+
+template <typename ScalarType, bool IsLarge>
+svuint32_t inline calculate_linear_constant_border(
+    svbool_t pg, svfloat32x2_t coords, svuint32_t sv_border, svuint32_t sv_xmax,
+    svuint32_t sv_ymax, svuint32_t sv_src_stride,
+    Rows<const ScalarType> &src_rows) {
+  svfloat32_t xf = svget2(coords, 0);
+  svfloat32_t yf = svget2(coords, 1);
+
+  // Convert obviously out-of-range coordinates to values that are just beyond
+  // the largest permitted image width & height. This avoids the need for
+  // special case handling elsewhere.
+  svfloat32_t big = svdup_n_f32(1 << 24);
+  xf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, xf), big), xf, big);
+  yf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, yf), big), yf, big);
+
+  svfloat32_t xf0 = svrintm_f32_x(pg, xf);
+  svfloat32_t yf0 = svrintm_f32_x(pg, yf);
+
+  svint32_t x0 = svcvt_s32_x(pg, xf0);
+  svint32_t y0 = svcvt_s32_x(pg, yf0);
+  svint32_t x1 = svadd_s32_x(pg, x0, svdup_n_s32(1));
+  svint32_t y1 = svadd_s32_x(pg, y0, svdup_n_s32(1));
+
+  svfloat32_t xfrac = svsub_f32_x(pg, xf, xf0);
+  svfloat32_t yfrac = svsub_f32_x(pg, yf, yf0);
+
+  svfloat32_t a = svcvt_f32_u32_x(
+      pg, get_pixels_or_border<ScalarType, IsLarge>(
+              pg, svreinterpret_u32_s32(x0), svreinterpret_u32_s32(y0),
+              sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows));
+  svfloat32_t b = svcvt_f32_u32_x(
+      pg, get_pixels_or_border<ScalarType, IsLarge>(
+              pg, svreinterpret_u32_s32(x1), svreinterpret_u32_s32(y0),
+              sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows));
+  svfloat32_t line0 = svmla_f32_x(pg, a, svsub_f32_x(pg, b, a), xfrac);
+  svfloat32_t c = svcvt_f32_u32_x(
+      pg, get_pixels_or_border<ScalarType, IsLarge>(
+              pg, svreinterpret_u32_s32(x0), svreinterpret_u32_s32(y1),
+              sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows));
+  svfloat32_t d = svcvt_f32_u32_x(
+      pg, get_pixels_or_border<ScalarType, IsLarge>(
+              pg, svreinterpret_u32_s32(x1), svreinterpret_u32_s32(y1),
+              sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows));
+  svfloat32_t line1 = svmla_f32_x(pg, c, svsub_f32_x(pg, d, c), xfrac);
+  svfloat32_t result =
+      svmla_f32_x(pg, line0, svsub_f32_x(pg, line1, line0), yfrac);
+  return svcvt_u32_f32_x(pg, svrinta_f32_x(pg, result));
+}
+
+}  // namespace KLEIDICV_TARGET_NAMESPACE
diff --git a/kleidicv/src/transform/remap_api.cpp b/kleidicv/src/transform/remap_api.cpp
index bda9c17f6958a5d405c4593bc0e7bcf733820020..9debeda611df78a246cc43c01562f3a265a979ef 100644
--- a/kleidicv/src/transform/remap_api.cpp
+++ b/kleidicv/src/transform/remap_api.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -22,3 +22,7 @@ KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16point5_u16,
                             &kleidicv::neon::remap_s16point5<uint16_t>,
                             &kleidicv::sve2::remap_s16point5<uint16_t>,
                             nullptr);
+
+KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_f32_u8,
+                            &kleidicv::neon::remap_f32<uint8_t>,
+                            &kleidicv::sve2::remap_f32<uint8_t>, nullptr);
diff --git a/kleidicv/src/transform/remap_neon.cpp b/kleidicv/src/transform/remap_neon.cpp
index a540daf56a69b92310a1bb87de6ee1b08f217b5e..c671c1d9ee31344569e0c8473ed31012f559114c 100644
--- a/kleidicv/src/transform/remap_neon.cpp
+++ b/kleidicv/src/transform/remap_neon.cpp
@@ -1,8 +1,9 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
 #include <cassert>
+#include <cstdint>  // TODO: check
 #include <type_traits>
 
 #include "kleidicv/kleidicv.h"
@@ -259,9 +260,11 @@ class RemapS16Point5Replicate<uint8_t> {
       FracVectorType frac = vld1q_u16(&mapfrac[0]);
       uint16x8_t xfrac =
           vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0),
+                    // extract xfrac = frac[0:4]
                     vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
       uint16x8_t yfrac =
           vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0),
+                    // extract yfrac = frac[5:9]
                     vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS),
                               vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)));
       uint16x8_t nxfrac = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac);
@@ -887,6 +890,392 @@ kleidicv_error_t remap_s16point5(
 }
 // NOLINTEND(readability-function-cognitive-complexity)
 
+template <typename ScalarType, bool IsLarge>
+class RemapF32Replicate;
+
+template <bool IsLarge>
+class RemapF32Replicate<uint8_t, IsLarge> {
+ public:
+  using ScalarType = uint8_t;
+  using MapVecTraits = neon::VecTraits<float>;
+  using MapVectorType = typename MapVecTraits::VectorType;  // float32x4_t
+
+  RemapF32Replicate(Rows<const ScalarType> src_rows, size_t src_width,
+                    size_t src_height)
+      : src_rows_{src_rows},
+        v_src_stride_{vdup_n_u32(static_cast<uint32_t>(src_rows_.stride()))},
+        vq_src_stride_{vdupq_n_u32(static_cast<uint32_t>(src_rows_.stride()))},
+        v_xmax_{vdupq_n_u32(static_cast<uint32_t>(src_width - 1))},
+        v_ymax_{vdupq_n_u32(static_cast<uint32_t>(src_height - 1))} {}
+
+  void process_row(size_t width, Columns<const float> mapx,
+                   Columns<const float> mapy, Columns<ScalarType> dst) {
+    const size_t kStep = VecTraits<float>::num_lanes();
+
+    auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) {
+      uint32x4_t offset = vmlaq_u32(x, y, vq_src_stride_);
+      uint64_t acc =
+          static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 0)]) |
+          (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 1)]) << 32);
+      uint64x2_t rawsrc = vdupq_n_u64(acc);
+      acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 2)]) |
+            (static_cast<uint64_t>(src_rows_[vgetq_lane_u32(offset, 3)]) << 32);
+      rawsrc = vsetq_lane_u64(acc, rawsrc, 1);
+      return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
+    };
+
+    auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) {
+      uint64x2_t offset_low =
+          vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride_);
+      uint64x2_t offset_high =
+          vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride_);
+      uint64_t acc =
+          static_cast<uint64_t>(src_rows_[vgetq_lane_u64(offset_low, 0)]) |
+          (static_cast<uint64_t>(src_rows_[vgetq_lane_u64(offset_low, 1)])
+           << 32);
+      uint64x2_t rawsrc = vdupq_n_u64(acc);
+      acc = static_cast<uint64_t>(src_rows_[vgetq_lane_u64(offset_high, 0)]) |
+            (static_cast<uint64_t>(src_rows_[vgetq_lane_u64(offset_high, 1)])
+             << 32);
+      rawsrc = vsetq_lane_u64(acc, rawsrc, 1);
+      return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
+    };
+
+    auto load = [&](uint32x4_t x, uint32x4_t y) {
+      if constexpr (IsLarge) {
+        return load_src_into_floats_large(x, y);
+      } else {
+        return load_src_into_floats_small(x, y);
+      }
+    };
+
+    auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) {
+      MapVectorType x = vld1q_f32(ptr_mapx);
+      MapVectorType y = vld1q_f32(ptr_mapy);
+      // Truncating convert to int
+      uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(x), v_xmax_);
+      uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(y), v_ymax_);
+
+      // Get fractional part, or 0 if out of range
+      float32x4_t zero = vdupq_n_f32(0.F);
+      uint32x4_t x_in_range =
+          vandq_u32(vcgeq_f32(x, zero), vcltq_u32(x0, v_xmax_));
+      uint32x4_t y_in_range =
+          vandq_u32(vcgeq_f32(y, zero), vcltq_u32(y0, v_ymax_));
+      float32x4_t xfrac =
+          vbslq_f32(x_in_range, vsubq_f32(x, vrndmq_f32(x)), zero);
+      float32x4_t yfrac =
+          vbslq_f32(y_in_range, vsubq_f32(y, vrndmq_f32(y)), zero);
+
+      // x1 = x0 + 1, except if it's already xmax or out of range
+      uint32x4_t x1 = vsubq_u32(x0, x_in_range);
+      uint32x4_t y1 = vsubq_u32(y0, y_in_range);
+
+      // Calculate offsets from coordinates (y * stride + x)
+      // a: top left, b: top right, c: bottom left, d: bottom right
+      float32x4_t a = load(x0, y0);
+      float32x4_t b = load(x1, y0);
+      float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac);
+      float32x4_t c = load(x0, y1);
+      float32x4_t d = load(x1, y1);
+      float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac);
+      float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac);
+      return vminq_u32(vdupq_n_u32(0xFF), vcvtaq_u32_f32(result));
+    };
+
+    auto vector_path_4 = [&](size_t step) {  // step = 4*4 = 16
+      const float *ptr_mapx = &mapx[0];
+      const float *ptr_mapy = &mapy[0];
+      uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy);
+
+      ptr_mapx += kStep;
+      ptr_mapy += kStep;
+      uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy);
+      uint16x8_t result16_0 = vuzp1q_u16(res0, res1);
+
+      ptr_mapx += kStep;
+      ptr_mapy += kStep;
+      res0 = vector_path_1(ptr_mapx, ptr_mapy);
+
+      ptr_mapx += kStep;
+      ptr_mapy += kStep;
+      res1 = vector_path_1(ptr_mapx, ptr_mapy);
+      uint16x8_t result16_1 = vuzp1q_u16(res0, res1);
+      vst1q_u8(&dst[0], vuzp1q_u8(result16_0, result16_1));
+      mapx += ptrdiff_t(step);
+      mapy += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    };
+
+    LoopUnroll loop{width, kStep};
+    loop.unroll_four_times(vector_path_4);
+    loop.unroll_once([&](size_t step) {
+      const float *ptr_mapx = &mapx[0];
+      const float *ptr_mapy = &mapy[0];
+      uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy);
+      dst[0] = vgetq_lane_u32(result, 0);
+      dst[1] = vgetq_lane_u32(result, 1);
+      dst[2] = vgetq_lane_u32(result, 2);
+      dst[3] = vgetq_lane_u32(result, 3);
+      mapx += ptrdiff_t(step);
+      mapy += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    });
+    ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
+                          static_cast<ptrdiff_t>(loop.remaining_length());
+    mapx -= back_step;
+    mapy -= back_step;
+    dst -= back_step;
+    loop.remaining([&](size_t, size_t) {
+      const float *ptr_mapx = &mapx[0];
+      const float *ptr_mapy = &mapy[0];
+      uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy);
+      dst[0] = vgetq_lane_u32(result, 0);
+      dst[1] = vgetq_lane_u32(result, 1);
+      dst[2] = vgetq_lane_u32(result, 2);
+      dst[3] = vgetq_lane_u32(result, 3);
+    });
+  }
+
+ private:
+  Rows<const ScalarType> src_rows_;
+  uint32x2_t v_src_stride_;   // load_large
+  uint32x4_t vq_src_stride_;  // load_small
+  uint32x4_t v_xmax_;
+  uint32x4_t v_ymax_;
+};  // end of class RemapF32Replicate<uint8_t>
+
+template <typename ScalarType, bool IsLarge>
+class RemapF32ConstantBorder;
+
+// TODO: Need to refactor to reduce the complexity
+// NOLINTBEGIN(readability-function-cognitive-complexity)
+template <bool IsLarge>
+class RemapF32ConstantBorder<uint8_t, IsLarge> {
+ public:
+  using ScalarType = uint8_t;
+  using MapVecTraits = neon::VecTraits<float>;
+  using MapVectorType = typename MapVecTraits::VectorType;  // float32x4_t
+
+  RemapF32ConstantBorder(Rows<const ScalarType> src_rows, size_t src_width,
+                         size_t src_height, const ScalarType *border_value)
+      : src_rows_{src_rows},
+        src_width_{src_width},
+        src_height_{src_height},
+        border_value_{border_value} {}
+
+  void process_row(size_t width, Columns<const float> mapx,
+                   Columns<const float> mapy, Columns<ScalarType> dst) {
+    const size_t kStep = VecTraits<float>::num_lanes();
+
+    auto get_edge_pixels =
+        [&](unsigned &a_result, unsigned &b_result, unsigned &c_result,
+            unsigned &d_result, int x0, int y0, ptrdiff_t offset,
+            Rows<const ScalarType> src_rows, int src_width, int src_height) {
+          if (y0 >= 0) {
+            if (x0 >= 0) {
+              a_result = src_rows[offset];
+            }
+            if (x0 + 1 < src_width) {
+              b_result = src_rows[offset + 1];
+            }
+          }
+          if (y0 + 1 < src_height) {
+            offset += static_cast<ptrdiff_t>(src_rows.stride());
+            if (x0 >= 0) {
+              c_result = src_rows[offset];
+            }
+            if (x0 + 1 < src_width) {
+              d_result = src_rows[offset + 1];
+            }
+          }
+        };
+
+    auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) {
+      MapVectorType xf = vld1q_f32(ptr_mapx);
+      MapVectorType yf = vld1q_f32(ptr_mapy);
+      // Convert obviously out-of-range coordinates to values that are just
+      // beyond the largest permitted image width & height. This avoids the need
+      // for special case handling elsewhere.
+      float32x4_t big = vdupq_n_f32(1 << 24);
+      xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big);
+      yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big);
+
+      int32x4_t x0 = vcvtmq_s32_f32(xf);
+      int32x4_t y0 = vcvtmq_s32_f32(yf);
+      int x0_array[4], y0_array[4];
+      unsigned a_array[4], b_array[4], c_array[4], d_array[4];
+      vst1q_s32(x0_array, x0);
+      vst1q_s32(y0_array, y0);
+      for (int i = 0; i < 4; ++i) {
+        int x0i = x0_array[i];
+        int y0i = y0_array[i];
+        ptrdiff_t offset = x0i + y0i * src_rows_.stride();
+
+        // src_width < (1ULL << 24) && src_height_ < (1ULL << 24) is guaranteed
+        if (x0i < 0 || x0i + 1 >= static_cast<int>(src_width_) || y0i < 0 ||
+            y0i + 1 >= static_cast<int>(src_height_)) {
+          // Not entirely within the source image
+
+          a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value_[0];
+
+          if (x0i < -1 || x0i >= static_cast<int>(src_width_) || y0i < -1 ||
+              y0i >= static_cast<int>(src_height_)) {
+            // Completely outside the source image
+            continue;
+          }
+
+          get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i,
+                          y0i, offset, src_rows_, src_width_, src_height_);
+          continue;
+        }
+
+        // Completely inside the source image
+        a_array[i] = src_rows_[offset];
+        b_array[i] = src_rows_[offset + 1];
+        offset += src_rows_.stride();
+        c_array[i] = src_rows_[offset];
+        d_array[i] = src_rows_[offset + 1];
+      }
+
+      float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf));
+      float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf));
+      float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array));
+      float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array));
+      float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac);
+      float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array));
+      float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array));
+      float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac);
+      float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac);
+      return vcvtaq_u32_f32(result);
+    };
+
+    auto vector_path_4 = [&](size_t step) {  // step = 4*4 = 16
+      const float *ptr_mapx = &mapx[0];
+      const float *ptr_mapy = &mapy[0];
+      uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy);
+
+      ptr_mapx += kStep;
+      ptr_mapy += kStep;
+      uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy);
+      uint16x8_t result16_0 = vuzp1q_u16(res0, res1);
+
+      ptr_mapx += kStep;
+      ptr_mapy += kStep;
+      res0 = vector_path_1(ptr_mapx, ptr_mapy);
+
+      ptr_mapx += kStep;
+      ptr_mapy += kStep;
+      res1 = vector_path_1(ptr_mapx, ptr_mapy);
+      uint16x8_t result16_1 = vuzp1q_u16(res0, res1);
+      vst1q_u8(&dst[0], vuzp1q_u8(result16_0, result16_1));
+      mapx += ptrdiff_t(step);
+      mapy += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    };
+
+    LoopUnroll loop{width, kStep};
+    loop.unroll_four_times(vector_path_4);
+    loop.unroll_once([&](size_t step) {
+      const float *ptr_mapx = &mapx[0];
+      const float *ptr_mapy = &mapy[0];
+      uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy);
+      dst[0] = vgetq_lane_u32(result, 0);
+      dst[1] = vgetq_lane_u32(result, 1);
+      dst[2] = vgetq_lane_u32(result, 2);
+      dst[3] = vgetq_lane_u32(result, 3);
+      mapx += ptrdiff_t(step);
+      mapy += ptrdiff_t(step);
+      dst += ptrdiff_t(step);
+    });
+    ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) -
+                          static_cast<ptrdiff_t>(loop.remaining_length());
+    mapx -= back_step;
+    mapy -= back_step;
+    dst -= back_step;
+    loop.remaining([&](size_t, size_t) {
+      const float *ptr_mapx = &mapx[0];
+      const float *ptr_mapy = &mapy[0];
+      uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy);
+      dst[0] = vgetq_lane_u32(result, 0);
+      dst[1] = vgetq_lane_u32(result, 1);
+      dst[2] = vgetq_lane_u32(result, 2);
+      dst[3] = vgetq_lane_u32(result, 3);
+    });
+  }
+
+ private:
+  Rows<const ScalarType> src_rows_;
+  size_t src_width_;
+  size_t src_height_;
+  const ScalarType *border_value_;
+};  // end of class RemapF32ConstantBorder<uint8_t>
+// NOLINTEND(readability-function-cognitive-complexity)
+
+// Most of the complexity comes from parameter checking.
+// NOLINTBEGIN(readability-function-cognitive-complexity)
+template <typename T>
+kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width,
+                           size_t src_height, T *dst, size_t dst_stride,
+                           size_t dst_width, size_t dst_height, size_t channels,
+                           float *mapx, size_t mapx_stride, float *mapy,
+                           size_t mapy_stride,
+                           kleidicv_interpolation_type_t interpolation,
+                           kleidicv_border_type_t border_type,
+                           [[maybe_unused]] const T *border_value) {
+  // may need to remove the maybe_unused
+  CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
+  CHECK_POINTER_AND_STRIDE(mapx, mapx_stride, dst_height);
+  CHECK_POINTER_AND_STRIDE(mapy, mapy_stride, dst_height);
+  CHECK_IMAGE_SIZE(src_width, src_height);
+  CHECK_IMAGE_SIZE(dst_width, dst_height);
+  if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
+    return KLEIDICV_ERROR_NULL_POINTER;
+  }
+
+  if (!remap_f32_is_implemented<T>(src_stride, src_width, src_height, dst_width,
+                                   border_type, channels, interpolation)) {
+    return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+  }
+
+  // Calculating in float32_t will only be precise until 24 bits
+  if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) ||
+      dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24)) {
+    return KLEIDICV_ERROR_RANGE;
+  }
+
+  Rows<const T> src_rows{src, src_stride, channels};
+  Rows<const float> mapx_rows{mapx, mapx_stride, 1};
+  Rows<const float> mapy_rows{mapy, mapy_stride, 1};
+  Rows<T> dst_rows{dst, dst_stride, channels};
+  Rectangle rect{dst_width, dst_height};
+
+  if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
+    if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
+      RemapF32ConstantBorder<T, true> operation{src_rows, src_width, src_height,
+                                                border_value};
+      zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+    } else {
+      RemapF32ConstantBorder<T, false> operation{src_rows, src_width,
+                                                 src_height, border_value};
+      zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+    }
+  } else {
+    assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
+    if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
+      RemapF32Replicate<T, true> operation{src_rows, src_width, src_height};
+      zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+    } else {
+      RemapF32Replicate<T, false> operation{src_rows, src_width, src_height};
+      zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows);
+    }
+  }
+
+  return KLEIDICV_OK;
+}
+// NOLINTEND(readability-function-cognitive-complexity)
+
 #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type)                          \
   template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16<type>(          \
       const type *src, size_t src_stride, size_t src_width, size_t src_height, \
@@ -908,4 +1297,14 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t);
 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t);
 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t);
 
+#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(type)                          \
+  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32<type>(          \
+      const type *src, size_t src_stride, size_t src_width, size_t src_height, \
+      type *dst, size_t dst_stride, size_t dst_width, size_t dst_height,       \
+      size_t channels, float *mapx, size_t mapx_stride, float *mapy,           \
+      size_t mapy_stride, kleidicv_interpolation_type_t interpolation,         \
+      kleidicv_border_type_t border_type, const type *border_value)
+
+KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t);
+
 }  // namespace kleidicv::neon
diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h
index 65c5afe1191b41db63103bc776daebe89ca439d7..8cf574b91c0aa3b5579a1a2396406c187e7b18ae 100644
--- a/kleidicv/src/transform/remap_sc.h
+++ b/kleidicv/src/transform/remap_sc.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -15,6 +15,7 @@
 #include <limits>
 #include <memory>
 
+#include "common_sc.h"
 #include "kleidicv/sve2.h"
 #include "kleidicv/transform/remap.h"
 
@@ -822,6 +823,150 @@ kleidicv_error_t remap_s16point5_sc(
 }
 // NOLINTEND(readability-function-cognitive-complexity)
 
+template <typename ScalarType, bool IsLarge,
+          kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border>
+void remap32f_process_rows(Rows<const ScalarType> src_rows, size_t src_width,
+                           size_t src_height, const ScalarType* border_value,
+                           Rows<ScalarType> dst_rows, size_t dst_width,
+                           size_t y_begin, size_t y_end,
+                           Rows<const float> mapx_rows,
+                           Rows<const float> mapy_rows) {
+  svbool_t pg_all32 = svptrue_b32();
+  svuint32_t sv_xmax = svdup_n_u32(src_width - 1);
+  svuint32_t sv_ymax = svdup_n_u32(src_height - 1);
+  svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride());
+  svuint32_t sv_border;
+  // sv_border is only used if the border type is constant.
+  // If the border type is not constant then border_value is permitted to be
+  // null and must not be read.
+  if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+    sv_border = svdup_n_u32(border_value[0]);
+  }
+
+  svfloat32_t xmaxf = svdup_n_f32(static_cast<float>(src_width - 1));
+  svfloat32_t ymaxf = svdup_n_f32(static_cast<float>(src_height - 1));
+
+  const size_t kStep = VecTraits<float>::num_lanes();
+
+  // auto get_coordinates = [&](svbool_t pg, size_t xs) {
+  auto coordinate_getter = [&](svbool_t pg, size_t xs) {
+    auto x = static_cast<ptrdiff_t>(xs);
+    return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]),
+                     svld1_f32(pg, &mapy_rows.as_columns()[x]));
+  };
+
+  auto calculate_linear = [&](svbool_t pg, uint32_t x) {
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
+      svfloat32x2_t coords = coordinate_getter(pg, x);
+      return calculate_linear_replicate<ScalarType, IsLarge>(
+          pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows);
+    } else {
+      static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
+      svfloat32x2_t coords = coordinate_getter(pg, x);
+      return calculate_linear_constant_border<ScalarType, IsLarge>(
+          pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows);
+    }
+  };
+
+  auto process_row = [&]() {
+    Columns<ScalarType> dst = dst_rows.as_columns();
+    LoopUnroll2 loop{dst_width, kStep};
+    // GCOVR_EXCL_START
+    if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
+      assert(!"INTER_NEAREST not implemented for RemapF32");
+      // GCOVR_EXCL_STOP
+    } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) {
+      loop.unroll_four_times([&](size_t x) {
+        ScalarType* p_dst = &dst[static_cast<ptrdiff_t>(x)];
+        svuint32_t res0 = calculate_linear(pg_all32, x);
+        x += kStep;
+        svuint32_t res1 = calculate_linear(pg_all32, x);
+        svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0),
+                                           svreinterpret_u16_u32(res1));
+        x += kStep;
+        res0 = calculate_linear(pg_all32, x);
+        x += kStep;
+        res1 = calculate_linear(pg_all32, x);
+        svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0),
+                                           svreinterpret_u16_u32(res1));
+        svst1_u8(svptrue_b8(), p_dst,
+                 svuzp1_u8(svreinterpret_u8_u16(result16_0),
+                           svreinterpret_u8_u16(result16_1)));
+      });
+      loop.unroll_once([&](size_t x) {
+        ScalarType* p_dst = &dst[static_cast<ptrdiff_t>(x)];
+        svuint32_t result = calculate_linear(pg_all32, x);
+        svst1b_u32(pg_all32, p_dst, result);
+      });
+      loop.remaining([&](size_t x, size_t x_max) {
+        ScalarType* p_dst = &dst[static_cast<ptrdiff_t>(x)];
+        svbool_t pg32 = svwhilelt_b32(x, x_max);
+        svuint32_t result = calculate_linear(pg32, x);
+        svst1b_u32(pg32, p_dst, result);
+      });
+    } else {
+      static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST ||
+                        Inter == KLEIDICV_INTERPOLATION_LINEAR,
+                    ": Unknown interpolation type!");
+    }
+    ++mapx_rows;
+    ++mapy_rows;
+  };
+
+  for (size_t y = y_begin; y < y_end; ++y) {
+    process_row();
+    ++dst_rows;
+  }
+}
+
+// Most of the complexity comes from parameter checking.
+// NOLINTBEGIN(readability-function-cognitive-complexity)
+template <typename T>
+kleidicv_error_t remap_f32_sc(const T* src, size_t src_stride, size_t src_width,
+                              size_t src_height, T* dst, size_t dst_stride,
+                              size_t dst_width, size_t dst_height,
+                              size_t channels, float* mapx, size_t mapx_stride,
+                              float* mapy, size_t mapy_stride,
+                              kleidicv_interpolation_type_t interpolation,
+                              kleidicv_border_type_t border_type,
+                              [[maybe_unused]] const T* border_value) {
+  // may need to remove the maybe_unused
+  CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
+  CHECK_POINTER_AND_STRIDE(mapx, mapx_stride, dst_height);
+  CHECK_POINTER_AND_STRIDE(mapy, mapy_stride, dst_height);
+  CHECK_IMAGE_SIZE(src_width, src_height);
+  CHECK_IMAGE_SIZE(dst_width, dst_height);
+  if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
+    return KLEIDICV_ERROR_NULL_POINTER;
+  }
+
+  if (!remap_f32_is_implemented<T>(src_stride, src_width, src_height, dst_width,
+                                   border_type, channels, interpolation)) {
+    return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+  }
+
+  // Calculating in float32_t will only be precise until 24 bits
+  if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) ||
+      dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24)) {
+    return KLEIDICV_ERROR_RANGE;
+  }
+
+  Rows<const T> src_rows{src, src_stride, channels};
+  Rows<const float> mapx_rows{mapx, mapx_stride, 1};
+  Rows<const float> mapy_rows{mapy, mapy_stride, 1};
+  Rows<T> dst_rows{dst, dst_stride, channels};
+  Rectangle rect{dst_width, dst_height};
+
+  remap32f_process_rows<T>(remap_image_is_large(src_rows, src_height),
+                           interpolation, border_type, src_rows, src_width,
+                           src_height, border_value, dst_rows, dst_width, 0,
+                           dst_height, mapx_rows, mapy_rows);
+
+  return KLEIDICV_OK;
+}
+// NOLINTEND(readability-function-cognitive-complexity)
+
 }  // namespace KLEIDICV_TARGET_NAMESPACE
 
 #endif  // KLEIDICV_REMAP_SC_H
diff --git a/kleidicv/src/transform/remap_sve2.cpp b/kleidicv/src/transform/remap_sve2.cpp
index 0777091d701349b6b91e9ce98d28ac91a4194655..624b7ce2479616e6ed86a4a73ceec7e160cb9582 100644
--- a/kleidicv/src/transform/remap_sve2.cpp
+++ b/kleidicv/src/transform/remap_sve2.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -33,6 +33,21 @@ kleidicv_error_t remap_s16point5(const T *src, size_t src_stride,
                                border_type, border_value);
 }
 
+template <typename T>
+kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width,
+                           size_t src_height, T *dst, size_t dst_stride,
+                           size_t dst_width, size_t dst_height, size_t channels,
+                           float *mapx, size_t mapx_stride, float *mapy,
+                           size_t mapy_stride,
+                           kleidicv_interpolation_type_t interpolation,
+                           kleidicv_border_type_t border_type,
+                           const T *border_value) {
+  return remap_f32_sc<uint8_t>(src, src_stride, src_width, src_height, dst,
+                               dst_stride, dst_width, dst_height, channels,
+                               mapx, mapx_stride, mapy, mapy_stride,
+                               interpolation, border_type, border_value);
+}
+
 #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type)                          \
   template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16<type>(          \
       const type *src, size_t src_stride, size_t src_width, size_t src_height, \
@@ -54,4 +69,14 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t);
 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t);
 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t);
 
+#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(type)                          \
+  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32<type>(          \
+      const type *src, size_t src_stride, size_t src_width, size_t src_height, \
+      type *dst, size_t dst_stride, size_t dst_width, size_t dst_height,       \
+      size_t channels, float *mapx, size_t mapx_stride, float *mapy,           \
+      size_t mapy_stride, kleidicv_interpolation_type_t interpolation,         \
+      kleidicv_border_type_t border_type, const type *border_value)
+
+KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t);
+
 }  // namespace kleidicv::sve2
diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h
index 20cda8d79922baf44c761f9983563df2ed2dc6f9..9656d318c5eb28f8181d5c51b9e8e14542cb97f0 100644
--- a/kleidicv/src/transform/warp_perspective_sc.h
+++ b/kleidicv/src/transform/warp_perspective_sc.h
@@ -8,6 +8,7 @@
 #include <cmath>
 #include <utility>
 
+#include "common_sc.h"
 #include "kleidicv/ctypes.h"
 #include "kleidicv/kleidicv.h"
 #include "kleidicv/sve2.h"
@@ -41,14 +42,12 @@ namespace KLEIDICV_TARGET_NAMESPACE {
 
 template <typename ScalarType, bool IsLarge,
           kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border>
-void warp_perspective_operation(Rows<const ScalarType> src_rows,
-                                size_t src_width, size_t src_height,
-                                const float transform[9],
-                                const ScalarType *border_value,
-                                Rows<ScalarType> dst_rows, size_t dst_width,
-                                size_t y_begin, size_t y_end) {
+void remap32f_process_rows(Rows<const ScalarType> src_rows, size_t src_width,
+                           size_t src_height, const ScalarType *border_value,
+                           Rows<ScalarType> dst_rows, size_t dst_width,
+                           size_t y_begin, size_t y_end,
+                           const float transform[9]) {
   svbool_t pg_all32 = svptrue_b32();
-  svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1));
   svuint32_t sv_xmax = svdup_n_u32(src_width - 1);
   svuint32_t sv_ymax = svdup_n_u32(src_height - 1);
   svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride());
@@ -60,16 +59,18 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     sv_border = svdup_n_u32(border_value[0]);
   }
 
-  svfloat32_t T0 = svdup_n_f32(transform[0]);
-  svfloat32_t T3 = svdup_n_f32(transform[3]);
-  svfloat32_t T6 = svdup_n_f32(transform[6]);
-  svfloat32_t tx0, ty0, tw0;
   svfloat32_t xmaxf = svdup_n_f32(static_cast<float>(src_width - 1));
   svfloat32_t ymaxf = svdup_n_f32(static_cast<float>(src_height - 1));
 
   const size_t kStep = VecTraits<float>::num_lanes();
 
-  auto calculate_coordinates = [&](size_t x) {
+  svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1));
+  svfloat32_t T0 = svdup_n_f32(transform[0]);
+  svfloat32_t T3 = svdup_n_f32(transform[3]);
+  svfloat32_t T6 = svdup_n_f32(transform[6]);
+  svfloat32_t tx0, ty0, tw0;
+
+  auto coordinate_getter = [&](svbool_t, size_t x) {
     svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast<float>(x));
     // Calculate half-transformed values from the first few pixel values,
     // plus Tn*x, similarly to the one above
@@ -84,8 +85,8 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
                      svmul_f32_x(pg_all32, ty, iw));
   };
 
-  auto calculate_nearest_coordinates = [&](size_t x) {
-    svfloat32x2_t coords = calculate_coordinates(x);
+  auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) {
+    svfloat32x2_t coords = coordinate_getter(pg32, x);
     svfloat32_t xf = svget2(coords, 0);
     svfloat32_t yf = svget2(coords, 1);
 
@@ -109,32 +110,11 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     return svcreate2(xi, yi);
   };
 
-  auto load = [&](svbool_t pg, svuint32_t x, svuint32_t y) {
-    if constexpr (IsLarge) {
-      svbool_t pg_b = pg;
-      svbool_t pg_t = svtrn2_b32(pg, svpfalse());
-
-      // Calculate offsets from coordinates (y * stride + x)
-      // To avoid losing precision, the final offsets should be in 64 bits
-      svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride);
-      svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride);
-      // Copy pixels from source
-      svuint64_t result_b =
-          svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
-      svuint64_t result_t =
-          svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
-      return svtrn1_u32(svreinterpret_u32_u64(result_b),
-                        svreinterpret_u32_u64(result_t));
-    } else {
-      svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
-      return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets);
-    }
-  };
-
   auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) {
     svbool_t in_range =
         svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
-    svuint32_t result = load(in_range, x, y);
+    svuint32_t result = load_common<ScalarType, IsLarge>(
+        in_range, x, y, sv_src_stride, src_rows);
     // Select between source pixels and border colour
     return svsel_u32(in_range, result, sv_border);
   };
@@ -146,19 +126,22 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
       if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
         return get_pixels_or_border(pg_all32, x, y);
       } else {
-        return load(pg_all32, x, y);
+        return load_common<ScalarType, IsLarge>(pg_all32, x, y, sv_src_stride,
+                                                src_rows);
       }
     };
     ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
-    svuint32_t res32_0 = load_source(calculate_nearest_coordinates(x));
+    svuint32_t res32_0 =
+        load_source(calculate_nearest_coordinates(pg_all32, x));
     x += kStep;
-    svuint32_t res32_1 = load_source(calculate_nearest_coordinates(x));
+    svuint32_t res32_1 =
+        load_source(calculate_nearest_coordinates(pg_all32, x));
     svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
                                     svreinterpret_u16_u32(res32_1));
     x += kStep;
-    res32_0 = load_source(calculate_nearest_coordinates(x));
+    res32_0 = load_source(calculate_nearest_coordinates(pg_all32, x));
     x += kStep;
-    res32_1 = load_source(calculate_nearest_coordinates(x));
+    res32_1 = load_source(calculate_nearest_coordinates(pg_all32, x));
     svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
                                     svreinterpret_u16_u32(res32_1));
     svuint8_t result =
@@ -171,7 +154,7 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     size_t length = x_max - x;
     svbool_t pg32 = svwhilelt_b32(0ULL, length);
 
-    svuint32x2_t coords = calculate_nearest_coordinates(x);
+    svuint32x2_t coords = calculate_nearest_coordinates(pg32, x);
     svuint32_t xi = svget2(coords, 0);
     svuint32_t yi = svget2(coords, 1);
 
@@ -179,109 +162,26 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
       result = get_pixels_or_border(pg32, xi, yi);
     } else {
-      result = load(pg32, xi, yi);
+      result = load_common<ScalarType, IsLarge>(pg32, xi, yi, sv_src_stride,
+                                                src_rows);
     }
     svst1b_u32(pg32, &dst[static_cast<ptrdiff_t>(x)], result);
   };
 
-  auto calculate_linear_replicate = [&](svbool_t pg, uint32_t x) {
-    auto load_source = [&](svuint32_t x, svuint32_t y) {
-      return load(pg, x, y);
-    };
-
-    svfloat32x2_t coords = calculate_coordinates(x);
-    svfloat32_t xf = svget2(coords, 0);
-    svfloat32_t yf = svget2(coords, 1);
-    // Take the integer part, clamp it to within the dimensions of the
-    // source image (negative values are already saturated to 0)
-    svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf));
-    svuint32_t y0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, yf, ymaxf));
-
-    // Get fractional part, or 0 if out of range
-    svbool_t x_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, xf, 0.F),
-                                  svcmplt_f32(pg_all32, xf, xmaxf));
-    svbool_t y_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, yf, 0.F),
-                                  svcmplt_f32(pg_all32, yf, ymaxf));
-    svfloat32_t xfrac = svsel_f32(
-        x_in_range, svsub_f32_x(pg_all32, xf, svrintm_x(pg_all32, xf)),
-        svdup_n_f32(0.F));
-    svfloat32_t yfrac = svsel_f32(
-        y_in_range, svsub_f32_x(pg_all32, yf, svrintm_x(pg_all32, yf)),
-        svdup_n_f32(0.F));
-
-    // x1 = x0 + 1, except if it's already xmax or out of range
-    svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0);
-    svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0);
-
-    // Calculate offsets from coordinates (y * stride + x)
-    // a: top left, b: top right, c: bottom left, d: bottom right
-    svfloat32_t a = svcvt_f32_u32_x(pg_all32, load_source(x0, y0));
-    svfloat32_t b = svcvt_f32_u32_x(pg_all32, load_source(x1, y0));
-    svfloat32_t line0 =
-        svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
-    svfloat32_t c = svcvt_f32_u32_x(pg_all32, load_source(x0, y1));
-    svfloat32_t d = svcvt_f32_u32_x(pg_all32, load_source(x1, y1));
-    svfloat32_t line1 =
-        svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
-    svfloat32_t result = svmla_f32_x(
-        pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
-    return svmin_u32_x(
-        pg_all32, svdup_n_u32(0xFF),
-        svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)));
-  };
-
-  auto calculate_linear_constant_border = [&](svbool_t pg, uint32_t x) {
-    svfloat32x2_t coords = calculate_coordinates(x);
-    svfloat32_t xf = svget2(coords, 0);
-    svfloat32_t yf = svget2(coords, 1);
-
-    // Convert obviously out-of-range coordinates to values that are just beyond
-    // the largest permitted image width & height. This avoids the need for
-    // special case handling elsewhere.
-    svfloat32_t big = svdup_n_f32(1 << 24);
-    xf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, xf), big), xf, big);
-    yf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, yf), big), yf, big);
-
-    svfloat32_t xf0 = svrintm_f32_x(pg, xf);
-    svfloat32_t yf0 = svrintm_f32_x(pg, yf);
-
-    svint32_t x0 = svcvt_s32_x(pg, xf0);
-    svint32_t y0 = svcvt_s32_x(pg, yf0);
-    svint32_t x1 = svadd_s32_x(pg, x0, svdup_n_s32(1));
-    svint32_t y1 = svadd_s32_x(pg, y0, svdup_n_s32(1));
-
-    svfloat32_t xfrac = svsub_f32_x(pg, xf, xf0);
-    svfloat32_t yfrac = svsub_f32_x(pg, yf, yf0);
-
-    svfloat32_t a =
-        svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x0),
-                                                 svreinterpret_u32_s32(y0)));
-    svfloat32_t b =
-        svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x1),
-                                                 svreinterpret_u32_s32(y0)));
-    svfloat32_t line0 = svmla_f32_x(pg, a, svsub_f32_x(pg, b, a), xfrac);
-    svfloat32_t c =
-        svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x0),
-                                                 svreinterpret_u32_s32(y1)));
-    svfloat32_t d =
-        svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x1),
-                                                 svreinterpret_u32_s32(y1)));
-    svfloat32_t line1 = svmla_f32_x(pg, c, svsub_f32_x(pg, d, c), xfrac);
-    svfloat32_t result =
-        svmla_f32_x(pg, line0, svsub_f32_x(pg, line1, line0), yfrac);
-    return svcvt_u32_f32_x(pg, svrinta_f32_x(pg, result));
-  };
-
   auto calculate_linear = [&](svbool_t pg, uint32_t x) {
     if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
-      return calculate_linear_replicate(pg, x);
+      svfloat32x2_t coords = coordinate_getter(pg, x);
+      return calculate_linear_replicate<ScalarType, IsLarge>(
+          pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows);
     } else {
       static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
-      return calculate_linear_constant_border(pg, x);
+      svfloat32x2_t coords = coordinate_getter(pg, x);
+      return calculate_linear_constant_border<ScalarType, IsLarge>(
+          pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows);
     }
   };
 
-  auto process_row = [&](size_t y, Columns<ScalarType> dst) {
+  auto process_row = [&](size_t y) {
     float fy = static_cast<float>(y);
     // Calculate half-transformed values at the first pixel (nominators)
     // tw =  T6*x + T7*y + T8
@@ -291,6 +191,7 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
     ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5]));
     tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8]));
 
+    Columns<ScalarType> dst = dst_rows.as_columns();
     LoopUnroll2 loop{dst_width, kStep};
     if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
       loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); });
@@ -336,54 +237,23 @@ void warp_perspective_operation(Rows<const ScalarType> src_rows,
   };
 
   for (size_t y = y_begin; y < y_end; ++y) {
-    process_row(y, dst_rows.as_columns());
+    process_row(y);
     ++dst_rows;
   }
 }
 
-// Convert border_type to a template argument.
-template <typename ScalarType, bool IsLarge,
-          kleidicv_interpolation_type_t Inter, typename... Args>
-void warp_perspective_operation(kleidicv_border_type_t border_type,
-                                Args &&...args) {
-  if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) {
-    warp_perspective_operation<ScalarType, IsLarge, Inter,
-                               KLEIDICV_BORDER_TYPE_REPLICATE>(
-        std::forward<Args>(args)...);
-  } else {
-    warp_perspective_operation<ScalarType, IsLarge, Inter,
-                               KLEIDICV_BORDER_TYPE_CONSTANT>(
-        std::forward<Args>(args)...);
-  }
-}
-
-// Convert interpolation_type to a template argument.
-template <typename ScalarType, bool IsLarge, typename... Args>
-void warp_perspective_operation(
-    kleidicv_interpolation_type_t interpolation_type, Args &&...args) {
-  if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) {
-    warp_perspective_operation<ScalarType, IsLarge,
-                               KLEIDICV_INTERPOLATION_NEAREST>(
-        std::forward<Args>(args)...);
-  } else {
-    warp_perspective_operation<ScalarType, IsLarge,
-                               KLEIDICV_INTERPOLATION_LINEAR>(
-        std::forward<Args>(args)...);
-  }
-}
-
 // Most of the complexity comes from parameter checking.
 // NOLINTBEGIN(readability-function-cognitive-complexity)
 template <typename T>
 KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc(
     const T *src, size_t src_stride, size_t src_width, size_t src_height,
     T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
-    size_t y_begin, size_t y_end, const float transformation[9],
-    size_t channels, kleidicv_interpolation_type_t interpolation,
+    size_t y_begin, size_t y_end, const float transform[9], size_t channels,
+    kleidicv_interpolation_type_t interpolation,
     kleidicv_border_type_t border_type, const T *border_value) {
   CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
   CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
-  CHECK_POINTERS(transformation);
+  CHECK_POINTERS(transform);
   CHECK_IMAGE_SIZE(src_width, src_height);
   CHECK_IMAGE_SIZE(dst_width, dst_height);
   if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
@@ -404,15 +274,10 @@ KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc(
 
   dst_rows += y_begin;
 
-  if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
-    warp_perspective_operation<T, true>(
-        interpolation, border_type, src_rows, src_width, src_height,
-        transformation, border_value, dst_rows, dst_width, y_begin, y_end);
-  } else {
-    warp_perspective_operation<T, false>(
-        interpolation, border_type, src_rows, src_width, src_height,
-        transformation, border_value, dst_rows, dst_width, y_begin, y_end);
-  }
+  remap32f_process_rows<T>(remap_image_is_large(src_rows, src_height),
+                           interpolation, border_type, src_rows, src_width,
+                           src_height, border_value, dst_rows, dst_width,
+                           y_begin, y_end, transform);
 
   return KLEIDICV_OK;
 }
diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
index 52eb762e60c8ccace05df93c01193f87ae0c4c8c..dfcdc7caec97bbfca6615946e88b1302d75c4dc8 100644
--- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
+++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -428,6 +428,18 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u16(
     kleidicv_border_type_t border_type, const uint16_t *border_value,
     kleidicv_thread_multithreading);
 
+/// Internal - not part of the public API and its direct use is not supported.
+///
+/// Multithreaded implementation of kleidicv_remap_f32_u8 - see the
+/// documentation of that function for more details.
+kleidicv_error_t kleidicv_thread_remap_f32_u8(
+    const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
+    uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
+    size_t channels, float *mapx, size_t mapx_stride, float *mapy,
+    size_t mapy_stride, kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t border_type, const uint8_t *border_value,
+    kleidicv_thread_multithreading);
+
 /// Internal - not part of the public API and its direct use is not supported.
 ///
 /// Multithreaded implementation of kleidicv_warp_perspective_u8 - see the
diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp
index d3c77db979b33d0a31f8921fc6e9f53dd865050c..2304946d457d8711a4dc2271235510aa6318c7e7 100644
--- a/kleidicv_thread/src/kleidicv_thread.cpp
+++ b/kleidicv_thread/src/kleidicv_thread.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -757,6 +757,31 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u16(
   return parallel_batches(callback, mt, dst_height);
 }
 
+kleidicv_error_t kleidicv_thread_remap_f32_u8(
+    const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
+    uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
+    size_t channels, float *mapx, size_t mapx_stride, float *mapy,
+    size_t mapy_stride, kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t border_type, const uint8_t *border_value,
+    kleidicv_thread_multithreading mt) {
+  if (!kleidicv::remap_f32_is_implemented<uint8_t>(
+          src_stride, src_width, src_height, dst_width, border_type, channels,
+          interpolation)) {
+    return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+  }
+  auto callback = [=](unsigned begin, unsigned end) {
+    return kleidicv_remap_f32_u8(
+        src, src_stride, src_width, src_height,
+        dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint8_t)),
+        dst_stride, dst_width, end - begin, channels,
+        mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)),
+        mapx_stride,
+        mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)),
+        mapy_stride, interpolation, border_type, border_value);
+  };
+  return parallel_batches(callback, mt, dst_height);
+}
+
 kleidicv_error_t kleidicv_thread_warp_perspective_u8(
     const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
     uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt
index 5e8ec82971105e09ef22826f0ed3b1f8d597916a..fa7514a17148e676b8083f76440693857758d7d6 100755
--- a/scripts/benchmark/benchmarks.txt
+++ b/scripts/benchmark/benchmarks.txt
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 
@@ -82,10 +82,14 @@ Remap_S16_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16
 Remap_S16_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)'
 Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)'
 Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)'
+Remap_F32_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)'
+
 Remap_S16_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)'
 Remap_S16_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)'
 Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)'
 Remap_S16Point5_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)'
+Remap_F32_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)'
+
 WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 1)'
 WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 1)'
 WarpPerspective_Nearest_Constant: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_CONSTANT, 1)'
diff --git a/scripts/ci-opencv.sh b/scripts/ci-opencv.sh
index 8482cbc9808d5b7c35795c9ed1ac0826e6bbdbdd..d98b8c9c4111510eba651572f65a293eee565051 100755
--- a/scripts/ci-opencv.sh
+++ b/scripts/ci-opencv.sh
@@ -104,7 +104,8 @@ IMGPROC_TEST_PATTERNS=(
   '*Imgproc_Dilate*'
   '*Imgproc_Erode*'
   '*Imgproc_PyramidDown*'
-  '*Imgproc_Remap*'
+  '*Imgproc_Remap.*'
+  '*Imgproc_Remap_Test*'
   '*Imgproc_Warp*'
 )
 IMGPROC_TEST_PATTERNS_STR="$(join_strings_with_colon "${IMGPROC_TEST_PATTERNS[*]}")"
diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp
index 8dd36012799bb5535a3b7f4ada96f8d19568e749..72de4a4bb0431909dee8a543f80c451bd65bc0ab 100644
--- a/test/api/test_remap.cpp
+++ b/test/api/test_remap.cpp
@@ -1,9 +1,13 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
 #include <gtest/gtest.h>
 
+#include <climits>
+#include <cmath>
+#include <cstdint>
+
 #include "framework/array.h"
 #include "framework/generator.h"
 #include "framework/utils.h"
@@ -524,6 +528,26 @@ class RemapS16Point5 : public testing::Test {
     test::Array2D<ScalarType> expected{dst_total_width, dst_h, padding,
                                        channels};
 
+    const int64_t kMaxVal = std::numeric_limits<ScalarType>::max();
+    auto generateSource = [&](size_t x, size_t y) {
+      return static_cast<ScalarType>((x + y) % 2 ? kMaxVal : 0);
+    };
+    for (size_t y = 0; y < src_h; ++y) {
+      *source.at(y, 0) = generateSource(y, 0);
+      *source.at(y, 1) = generateSource(y, 1);
+      *source.at(y, 2) = generateSource(y, 2);
+      *source.at(y, src_w - 3) = generateSource(y, src_w - 3);
+      *source.at(y, src_w - 2) = generateSource(y, src_w - 2);
+      *source.at(y, src_w - 1) = generateSource(y, src_w - 1);
+    }
+    for (size_t x = 0; x < src_w; ++x) {
+      *source.at(0, x) = generateSource(0, x);
+      *source.at(1, x) = generateSource(1, x);
+      *source.at(2, x) = generateSource(2, x);
+      *source.at(src_h - 3, x) = generateSource(src_h - 3, x);
+      *source.at(src_h - 2, x) = generateSource(src_h - 2, x);
+      *source.at(src_h - 1, x) = generateSource(src_h - 1, x);
+    }
     actual.fill(42);
 
     calculate_expected(source, mapxy, mapfrac, border_type, border_value,
@@ -826,3 +850,534 @@ TYPED_TEST(RemapS16Point5, UnsupportedTooSmallImage) {
           src, 1 * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 7, 1, 1,
           mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
 }
+
+template <class ScalarType>
+class RemapF32 : public testing::Test {
+ public:
+  static void test_random(size_t src_w, size_t src_h, size_t dst_w,
+                          size_t dst_h, size_t channels,
+                          kleidicv_border_type_t border_type,
+                          const ScalarType *border_value, size_t padding) {
+    test::Array2D<float> mapx(dst_w, dst_h, padding);
+    test::PseudoRandomNumberGenerator<float> xcoord_generator;
+    mapx.fill(xcoord_generator);
+    test::Array2D<float> mapy(dst_w, dst_h, padding);
+    test::PseudoRandomNumberGenerator<float> ycoord_generator;
+    mapy.fill(ycoord_generator);
+    execute_test(mapx, mapy, src_w, src_h, dst_w, dst_h, channels, border_type,
+                 border_value, padding);
+  }
+
+  static void test_outside_random(size_t src_w, size_t src_h, size_t dst_w,
+                                  size_t dst_h, size_t channels,
+                                  kleidicv_border_type_t border_type,
+                                  const ScalarType *border_value,
+                                  size_t padding) {
+    test::Array2D<float> mapx(dst_w, dst_h, padding);
+    test::PseudoRandomNumberGeneratorFloatRange<float> xcoord_generator{
+        // static_cast<float>(-src_w), static_cast<float>(2 * src_w)}; -src_w
+        // overflow
+        static_cast<float>(-static_cast<int64_t>(src_w)),
+        static_cast<float>(2 * src_w)};
+    mapx.fill(xcoord_generator);
+    test::Array2D<float> mapy(dst_w, dst_h, padding);
+    test::PseudoRandomNumberGeneratorFloatRange<float> ycoord_generator{
+        static_cast<float>(-static_cast<int64_t>(src_h)),
+        static_cast<float>(2 * src_h)};
+    mapy.fill(ycoord_generator);
+    execute_test(mapx, mapy, src_w, src_h, dst_w, dst_h, channels, border_type,
+                 border_value, padding);
+  }
+
+  static void test_blend(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h,
+                         size_t channels, kleidicv_border_type_t border_type,
+                         const ScalarType *border_value, size_t padding) {
+    test::Array2D<float> mapx(dst_w, dst_h, padding);
+    test::Array2D<float> mapy(dst_w, dst_h, padding);
+    for (size_t row = 0; row < dst_h; ++row) {
+      for (size_t column = 0; column < dst_w; ++column) {
+        // Use a second degree function to add a nonlinear blend to the image
+        auto r = static_cast<double>(row), c = static_cast<double>(column),
+             w = static_cast<double>(dst_w), h = static_cast<double>(dst_h);
+        double x = c * 2 - c * c / w;
+        double y = r * (w - c) / w + 4 * r / h;
+        *mapx.at(row, column) = std::max<float>(
+            0, std::min(static_cast<float>(src_w - 1), static_cast<float>(x)));
+        *mapy.at(row, column) = std::max<float>(
+            0, std::min(static_cast<float>(src_h - 1), static_cast<float>(y)));
+      }
+    }
+    execute_test(mapx, mapy, src_w, src_h, dst_w, dst_h, channels, border_type,
+                 border_value, padding);
+  }
+
+  // Test coordinates with edge values that may easily overflow
+  static void test_corner_cases(size_t src_w, size_t src_h, size_t dst_w,
+                                size_t dst_h, size_t channels,
+                                kleidicv_border_type_t border_type,
+                                const ScalarType *border_value,
+                                size_t padding) {
+    test::Array2D<float> mapx(dst_w, dst_h, padding);
+    test::Array2D<float> mapy(dst_w, dst_h, padding);
+    const float corner_x_values[] = {std::numeric_limits<float>::min(),
+                                     -1.8,
+                                     -0.3,
+                                     0.2,
+                                     static_cast<float>(src_w) - 1.5F,
+                                     static_cast<float>(src_w) - 0.93F,
+                                     static_cast<float>(src_w) + 0.1F,
+                                     static_cast<float>(src_w) + 1.2F,
+                                     std::numeric_limits<float>::max()};
+    const float corner_y_values[] = {std::numeric_limits<float>::min(),
+                                     -1.3,
+                                     -0.7,
+                                     0.33,
+                                     1.1,
+                                     static_cast<float>(src_h) - 1.8F,
+                                     static_cast<float>(src_h) - 0.77F,
+                                     static_cast<float>(src_h) + 0.17F,
+                                     static_cast<float>(src_h) + 1.06F,
+                                     std::numeric_limits<float>::max()};
+    const size_t nx = sizeof(corner_x_values) / sizeof(float);
+    const size_t ny = sizeof(corner_y_values) / sizeof(float);
+    size_t counter = 0;
+    for (size_t row = 0; row < dst_h; ++row) {
+      for (size_t column = 0; column < dst_w; ++column) {
+        *mapx.at(row, column) = corner_x_values[counter % nx];
+        *mapy.at(row, column) = corner_y_values[counter % ny];
+        ++counter;
+      }
+    }
+
+    // This part is the same as execute_test() but without initializing source.
+    // Corner Cases use the biggest possible source.
+    size_t src_total_width = channels * src_w;
+    size_t dst_total_width = channels * dst_w;
+
+    test::Array2D<ScalarType> source{src_total_width, src_h, padding, channels};
+    test::Array2D<ScalarType> actual{dst_total_width, dst_h, padding, channels};
+    test::Array2D<ScalarType> expected{dst_total_width, dst_h, padding,
+                                       channels};
+
+    const int64_t kMaxVal = std::numeric_limits<ScalarType>::max();
+    auto generateSource = [&](size_t x, size_t y) {
+      return static_cast<ScalarType>((x + y) % 2 ? kMaxVal : 27);
+    };
+    for (size_t y = 0; y < src_h; ++y) {
+      *source.at(y, 0) = generateSource(y, 0);
+      *source.at(y, 1) = generateSource(y, 1);
+      *source.at(y, 2) = generateSource(y, 2);
+      *source.at(y, src_w - 3) = generateSource(y, src_w - 3);
+      *source.at(y, src_w - 2) = generateSource(y, src_w - 2);
+      *source.at(y, src_w - 1) = generateSource(y, src_w - 1);
+    }
+    for (size_t x = 0; x < src_w; ++x) {
+      *source.at(0, x) = generateSource(0, x);
+      *source.at(1, x) = generateSource(1, x);
+      *source.at(2, x) = generateSource(2, x);
+      *source.at(src_h - 3, x) = generateSource(src_h - 3, x);
+      *source.at(src_h - 2, x) = generateSource(src_h - 2, x);
+      *source.at(src_h - 1, x) = generateSource(src_h - 1, x);
+    }
+
+    test::PseudoRandomNumberGenerator<ScalarType> generator;
+    actual.fill(42);
+
+    calculate_expected(source, mapx, mapy, border_type, border_value, expected);
+
+    ASSERT_EQ(
+        KLEIDICV_OK,
+        kleidicv_remap_f32_u8(
+            source.data(), source.stride(), source.width(), source.height(),
+            actual.data(), actual.stride(), actual.width(), actual.height(),
+            channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(),
+            KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value));
+
+    EXPECT_EQ_ARRAY2D(actual, expected);
+  }
+
+ private:
+  static void execute_test(test::Array2D<float> &mapx,
+                           test::Array2D<float> &mapy, size_t src_w,
+                           size_t src_h, size_t dst_w, size_t dst_h,
+                           size_t channels, kleidicv_border_type_t border_type,
+                           const ScalarType *border_value, size_t padding) {
+    size_t src_total_width = channels * src_w;
+    size_t dst_total_width = channels * dst_w;
+
+    test::Array2D<ScalarType> source{src_total_width, src_h, padding, channels};
+    test::Array2D<ScalarType> actual{dst_total_width, dst_h, padding, channels};
+    test::Array2D<ScalarType> expected{dst_total_width, dst_h, padding,
+                                       channels};
+    test::PseudoRandomNumberGenerator<ScalarType> generator;
+    source.fill(generator);
+    actual.fill(42);
+
+    calculate_expected(source, mapx, mapy, border_type, border_value, expected);
+
+    ASSERT_EQ(
+        KLEIDICV_OK,
+        kleidicv_remap_f32_u8(
+            source.data(), source.stride(), source.width(), source.height(),
+            actual.data(), actual.stride(), actual.width(), actual.height(),
+            channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(),
+            KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value));
+
+    EXPECT_EQ_ARRAY2D(actual, expected);
+  }
+
+  static void calculate_expected(test::Array2D<ScalarType> &src,
+                                 test::Array2D<float> &mapx,
+                                 test::Array2D<float> &mapy,
+                                 kleidicv_border_type_t border_type,
+                                 const ScalarType *border_value,
+                                 test::Array2D<ScalarType> &expected) {
+    auto get_src = [&](ptrdiff_t x, ptrdiff_t y) {
+      return get_array2d_element_or_border(src, x, y, border_type,
+                                           border_value);
+    };
+
+    for (size_t row = 0; row < expected.height(); row++) {
+      for (size_t column = 0; column < expected.width() / src.channels();
+           ++column) {
+        for (size_t ch = 0; ch < src.channels(); ++ch) {
+          float x = *mapx.at(row, column);
+          float y = *mapy.at(row, column);
+          // ptrdiff_t ix = std::floor(x);
+          // ptrdiff_t iy = std::floor(y);
+          ptrdiff_t ix = static_cast<ptrdiff_t>(std::max<float>(
+              INT_MIN,
+              std::min<float>(std::floor(x),
+                              static_cast<float>(KLEIDICV_MAX_IMAGE_PIXELS))));
+          ptrdiff_t iy = static_cast<ptrdiff_t>(std::max<float>(
+              INT_MIN,
+              std::min<float>(std::floor(y),
+                              static_cast<float>(KLEIDICV_MAX_IMAGE_PIXELS))));
+          float xfrac = x - std::floor(x);
+          float yfrac = y - std::floor(y);
+          float a = get_src(ix, iy)[ch];
+          float b = get_src(ix + 1, iy)[ch];
+          float c = get_src(ix, iy + 1)[ch];
+          float d = get_src(ix + 1, iy + 1)[ch];
+          float line1 = (b - a) * xfrac + a;
+          float line2 = (d - c) * xfrac + c;
+          float float_result = (line2 - line1) * yfrac + line1;
+          *expected.at(row, column * src.channels() + ch) =
+              static_cast<ScalarType>(std::lround(float_result));
+        }
+      }
+    }
+  }
+};
+
+using RemapF32ElementTypes = ::testing::Types<uint8_t>;
+TYPED_TEST_SUITE(RemapF32, RemapF32ElementTypes);
+
+TYPED_TEST(RemapF32, RandomNoPadding) {
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  size_t channels = 1;
+  size_t padding = 0;
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, border_type,
+                             border_value, padding);
+  }
+}
+
+TYPED_TEST(RemapF32, BlendPadding) {
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  size_t channels = 1;
+  size_t padding = 13;
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type,
+                            border_value, padding);
+  }
+}
+
+TYPED_TEST(RemapF32, OutsideRandomPadding) {
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  size_t channels = 1;
+  size_t padding = 13;
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels,
+                                     border_type, border_value, padding);
+  }
+}
+
+TYPED_TEST(RemapF32, BlendBigStride) {
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 2;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  size_t channels = 1;
+  size_t padding = 1 << 16;
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type,
+                            border_value, padding);
+  }
+}
+
+TYPED_TEST(RemapF32, CornerCases) {
+  size_t src_w = (1ULL << 12) - 1;
+  size_t src_h = (1ULL << 12) - 1;
+  size_t dst_w = 4;
+  size_t dst_h = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t channels = 1;
+  size_t padding = 17;
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels,
+                                   border_type, border_value, padding);
+  }
+}
+
+// Test for load_src_into_floats_large
+TYPED_TEST(RemapF32, CornerCasesLargeLoad) {
+  // TODO: It takes long to run!
+  size_t src_w = 1ULL << 18;
+  size_t src_h = 1ULL << 14;
+  size_t dst_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t dst_h = 4;
+  size_t channels = 1;
+  size_t padding = 1;
+  for (auto [border_type, border_value] : get_borders<TypeParam>()) {
+    TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels,
+                                   border_type, border_value, padding);
+  }
+}
+
+TYPED_TEST(RemapF32, NullPointer) {
+  const size_t element_size = sizeof(TypeParam);
+  const size_t src_width = 2;
+  const size_t src_height = 2;
+  const size_t src_stride = src_width * element_size;
+  const size_t dst_width = 1;
+  const size_t dst_height = 1;
+  const size_t dst_stride = dst_width * element_size;
+  const TypeParam src[4] = {};
+  TypeParam dst[1];
+  const size_t channels = 1;
+  float mapx[1] = {};
+  const size_t mapx_stride = dst_width * sizeof(float);
+  float mapy[1] = {};
+  const size_t mapy_stride = dst_width * sizeof(float);
+  const TypeParam border_value[1] = {};
+  test::test_null_args(kleidicv_remap_f32_u8, src, src_stride, src_width,
+                       src_height, dst, dst_stride, dst_width, dst_height,
+                       channels, mapx, mapx_stride, mapy, mapy_stride,
+                       KLEIDICV_INTERPOLATION_LINEAR,
+                       KLEIDICV_BORDER_TYPE_CONSTANT, border_value);
+}
+
+TYPED_TEST(RemapF32, ZeroImageSize) {
+  const TypeParam src[1] = {};
+  TypeParam dst[1];
+  const size_t src_stride = sizeof(TypeParam);
+  const size_t dst_stride = sizeof(TypeParam);
+  float mapx[1] = {};
+  float mapy[1] = {};
+  const size_t mapx_stride = sizeof(float);
+  const size_t mapy_stride = sizeof(float);
+
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_remap_f32_u8(src, src_stride, 0, 1, dst, dst_stride, 0, 1,
+                                  1, mapx, mapx_stride, mapy, mapy_stride,
+                                  KLEIDICV_INTERPOLATION_LINEAR,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_remap_f32_u8(src, src_stride, 1, 0, dst, dst_stride, 1, 0,
+                                  1, mapx, mapx_stride, mapy, mapy_stride,
+                                  KLEIDICV_INTERPOLATION_LINEAR,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, InvalidImageSize) {
+  const size_t element_size = sizeof(TypeParam);
+  const TypeParam src[1] = {};
+  TypeParam dst[1];
+  float mapx[1] = {};
+  float mapy[1] = {};
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_RANGE,
+      kleidicv_remap_f32_u8(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS + 1, 1,
+                            dst, element_size, 1, 1, 1, mapx, sizeof(float),
+                            mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR,
+                            KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_remap_f32_u8(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS,
+                                  KLEIDICV_MAX_IMAGE_PIXELS, dst, element_size,
+                                  1, 1, 1, mapx, sizeof(float), mapy,
+                                  sizeof(float), KLEIDICV_INTERPOLATION_LINEAR,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, element_size,
+                                  KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, mapx,
+                                  sizeof(float), mapy, sizeof(float),
+                                  KLEIDICV_INTERPOLATION_LINEAR,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_RANGE,
+      kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, element_size,
+                            KLEIDICV_MAX_IMAGE_PIXELS,
+                            KLEIDICV_MAX_IMAGE_PIXELS, 1, mapx, sizeof(float),
+                            mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR,
+                            KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, UnsupportedTwoChannels) {
+  const size_t element_size = sizeof(TypeParam);
+  const TypeParam src[1] = {};
+  TypeParam dst[16];
+  float mapx[16] = {};
+  float mapy[16] = {};
+  const size_t channels = 2;
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_NOT_IMPLEMENTED,
+      kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 16,
+                            1, channels, mapx, 16 * sizeof(float), mapy,
+                            16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR,
+                            KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, UnsupportedInterpolationTypeNEAREST) {
+  const size_t element_size = sizeof(TypeParam);
+  const TypeParam src[1] = {};
+  TypeParam dst[16];
+  float mapx[16] = {};
+  float mapy[16] = {};
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_NOT_IMPLEMENTED,
+      kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 16,
+                            1, 1, mapx, 16 * sizeof(float), mapy,
+                            16 * sizeof(float), KLEIDICV_INTERPOLATION_NEAREST,
+                            KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, UnsupportedTooSmallImage) {
+  const size_t element_size = sizeof(TypeParam);
+  const TypeParam src[1] = {};
+  TypeParam dst[16];
+  float mapx[16] = {};
+  float mapy[16] = {};
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_NOT_IMPLEMENTED,
+      kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 3,
+                            1, 1, mapx, 16 * sizeof(float), mapy,
+                            16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR,
+                            KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, UnsupportedBigStride) {
+  const size_t element_size = sizeof(TypeParam);
+  const TypeParam src[1] = {};
+  TypeParam dst[16];
+  float mapx[16] = {};
+  float mapy[16] = {};
+  const uint64_t src_stride =
+      static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) + 1;
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_NOT_IMPLEMENTED,
+      kleidicv_remap_f32_u8(src, src_stride, 1, 1, dst, 16 * element_size, 16,
+                            1, 1, mapx, 16 * sizeof(float), mapy,
+                            16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR,
+                            KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, UnsupportedBigSourceWidth) {
+  const size_t element_size = sizeof(TypeParam);
+  const TypeParam src[1] = {};
+  TypeParam dst[16];
+  float mapx[16] = {};
+  float mapy[16] = {};
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_remap_f32_u8(src, element_size, 1ULL << 24, 1, dst,
+                                  16 * element_size, 16, 1, 1, mapx,
+                                  16 * sizeof(float), mapy, 16 * sizeof(float),
+                                  KLEIDICV_INTERPOLATION_LINEAR,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_remap_f32_u8(src, element_size, (1ULL << 32) + 1, 1, dst,
+                                  16 * element_size, 16, 1, 1, mapx,
+                                  16 * sizeof(float), mapy, 16 * sizeof(float),
+                                  KLEIDICV_INTERPOLATION_LINEAR,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, UnsupportedBigSourceHeight) {
+  const size_t element_size = sizeof(TypeParam);
+  const TypeParam src[1] = {};
+  TypeParam dst[16];
+  float mapx[16] = {};
+  float mapy[16] = {};
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_remap_f32_u8(src, element_size, 1, 1ULL << 24, dst,
+                                  16 * element_size, 16, 1, 1, mapx,
+                                  16 * sizeof(float), mapy, 16 * sizeof(float),
+                                  KLEIDICV_INTERPOLATION_LINEAR,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_remap_f32_u8(src, element_size, 1, (1ULL << 32) + 1, dst,
+                                  16 * element_size, 16, 1, 1, mapx,
+                                  16 * sizeof(float), mapy, 16 * sizeof(float),
+                                  KLEIDICV_INTERPOLATION_LINEAR,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, UnsupportedBigDestinationWidth) {
+  const size_t element_size = sizeof(TypeParam);
+  const TypeParam src[1] = {};
+  TypeParam dst[16];
+  float mapx[16] = {};
+  float mapy[16] = {};
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_RANGE,
+      kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size,
+                            1ULL << 24, 1, 1, mapx, 16 * sizeof(float), mapy,
+                            16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR,
+                            KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, UnsupportedBigDestinationHeight) {
+  const size_t element_size = sizeof(TypeParam);
+  const TypeParam src[1] = {};
+  TypeParam dst[16];
+  float mapx[16] = {};
+  float mapy[16] = {};
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_RANGE,
+      kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 16,
+                            1ULL << 24, 1, mapx, 16 * sizeof(float), mapy,
+                            16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR,
+                            KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+TYPED_TEST(RemapF32, Misalignment) {
+  const size_t element_size = sizeof(TypeParam);
+  if (element_size == 1) {
+    // misalignment impossible
+    GTEST_SKIP();
+  }
+
+  // Will be needed when supporting uint16_t
+}
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
index 3861d8028065cc2b26b6837fc1c0b0c95ee3f398..c930b4757678e9e452636c4bf9731bcab6bbc29f 100644
--- a/test/api/test_thread.cpp
+++ b/test/api/test_thread.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -244,6 +244,80 @@ class Thread : public testing::TestWithParam<P> {
     EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, result);
   }
 
+  template <typename T, typename SingleThreadedFunc, typename MultithreadedFunc,
+            typename... Args>
+  void check_remap_f32(SingleThreadedFunc single_threaded_func,
+                       MultithreadedFunc multithreaded_func, size_t channels,
+                       Args... args) {
+    unsigned test_width = 0, height = 0, thread_count = 0;
+    std::tie(test_width, height, thread_count) = GetParam();
+    const unsigned src_width = 300, src_height = 300;
+    // width < 4 are not supported, that's not tested here
+    size_t width = test_width + 4;
+    test::Array2D<T> src(size_t{src_width} * channels, src_height);
+    test::Array2D<float> mapx(width * 2, height);
+    test::Array2D<float> mapy(width, height);
+    test::Array2D<T> dst_single(width * channels, height),
+        dst_multi(width * channels, height);
+
+    test::PseudoRandomNumberGenerator<T> src_generator;
+    src.fill(src_generator);
+    test::PseudoRandomNumberGeneratorFloatRange<float> xcoord_generator{
+        0, std::min(static_cast<float>(src_height - 1),
+                    static_cast<float>(src_width - 1))};
+    mapx.fill(xcoord_generator);
+    test::PseudoRandomNumberGeneratorFloatRange<float> ycoord_generator{
+        0, std::min(static_cast<float>(src_height - 1),
+                    static_cast<float>(src_width - 1))};
+    mapy.fill(ycoord_generator);
+
+    kleidicv_error_t single_result = single_threaded_func(
+        src.data(), src.stride(), src_width, src_height, dst_single.data(),
+        dst_single.stride(), width, height, channels, mapx.data(),
+        mapx.stride(), mapy.data(), mapy.stride(), args...);
+
+    kleidicv_error_t multi_result = multithreaded_func(
+        src.data(), src.stride(), src_width, src_height, dst_multi.data(),
+        dst_multi.stride(), width, height, channels, mapx.data(), mapx.stride(),
+        mapy.data(), mapy.stride(), args...,
+        get_multithreading_fake(thread_count));
+
+    EXPECT_EQ(KLEIDICV_OK, single_result);
+    EXPECT_EQ(KLEIDICV_OK, multi_result);
+    EXPECT_EQ_ARRAY2D(dst_multi, dst_single);
+  }
+
+  template <typename T, typename MultithreadedFunc, typename... Args>
+  void check_remap_f32_not_implemented(MultithreadedFunc multithreaded_func,
+                                       size_t channels, Args... args) {
+    unsigned test_width = 0, height = 0, thread_count = 0;
+    std::tie(test_width, height, thread_count) = GetParam();
+    const unsigned src_width = 300, src_height = 300;
+    // width < 4 are not supported, that's not tested here
+    size_t width = test_width + 4;
+    test::Array2D<T> src(size_t{src_width} * channels, src_height);
+    test::Array2D<float> mapx(width * 2, height);
+    test::Array2D<float> mapy(width, height);
+    test::Array2D<T> dst_small(test_width * channels, height),
+        dst(width * channels, height);
+
+    kleidicv_error_t result = multithreaded_func(
+        src.data(), src.stride(), src_width, src_height, dst.data(),
+        dst.stride(), width, height, channels, mapx.data(), mapx.stride(),
+        mapy.data(), mapy.stride(), args...,
+        get_multithreading_fake(thread_count));
+
+    EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, result);
+
+    result = multithreaded_func(src.data(), src.stride(), src_width, src_height,
+                                dst_small.data(), dst_small.stride(),
+                                test_width, height, channels, mapx.data(),
+                                mapx.stride(), mapy.data(), mapy.stride(),
+                                args..., get_multithreading_fake(thread_count));
+
+    EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, result);
+  }
+
   template <typename T, typename SingleThreadedFunc, typename MultithreadedFunc,
             typename... Args>
   void check_warp_perspective(SingleThreadedFunc single_threaded_func,
@@ -688,6 +762,32 @@ TEST_P(Thread, remap_s16point5_u16_not_implemented) {
       &border_value);
 }
 
+TEST_P(Thread, remap_f32_u8_border_replicate) {
+  check_remap_f32<uint8_t>(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8,
+                           1, KLEIDICV_INTERPOLATION_LINEAR,
+                           KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
+}
+
+TEST_P(Thread, remap_f32_u8_border_constant) {
+  const uint8_t border_value = 0;
+  check_remap_f32<uint8_t>(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8,
+                           1, KLEIDICV_INTERPOLATION_LINEAR,
+                           KLEIDICV_BORDER_TYPE_CONSTANT, &border_value);
+}
+
+TEST_P(Thread, remap_f32_u8_not_implemented) {
+  const uint8_t border_value = 0;
+  check_remap_f32_not_implemented<uint8_t>(
+      kleidicv_thread_remap_f32_u8, 2, KLEIDICV_INTERPOLATION_LINEAR,
+      KLEIDICV_BORDER_TYPE_REPLICATE, &border_value);
+  check_remap_f32_not_implemented<uint8_t>(
+      kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_NEAREST,
+      KLEIDICV_BORDER_TYPE_REPLICATE, &border_value);
+  check_remap_f32_not_implemented<uint8_t>(
+      kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_LINEAR,
+      KLEIDICV_BORDER_TYPE_REFLECT, &border_value);
+}
+
 TEST_P(Thread, warp_perspective_u8_border_replicate) {
   const uint8_t border_value = 0;
   check_warp_perspective<uint8_t>(