From d2b365765bece0d8ad569ad4c31842620363d2d1 Mon Sep 17 00:00:00 2001
From: Denes Tarjan <denes.tarjan@arm.com>
Date: Thu, 19 Dec 2024 07:49:49 +0000
Subject: [PATCH] Implement WarpPerspective Linear Interpolation

---
 CHANGELOG.md                                  |   2 +
 adapters/opencv/kleidicv_hal.cpp              |  26 +-
 adapters/opencv/kleidicv_hal.h                |   5 -
 adapters/opencv/opencv-4.10.patch             |  33 +-
 adapters/opencv/opencv-5.x.patch              |  32 +-
 benchmark/benchmark.cpp                       |  37 +-
 conformity/opencv/test_warp_perspective.cpp   |  20 +-
 doc/functionality.md                          |   6 +
 doc/opencv.md                                 |  11 +
 kleidicv/CMakeLists.txt                       |   1 -
 kleidicv/include/kleidicv/config.h.in         |   2 -
 kleidicv/include/kleidicv/kleidicv.h          |   8 +-
 .../kleidicv/transform/warp_perspective.h     |  11 +-
 kleidicv/include/kleidicv/utils.h             |  29 +-
 .../src/transform/warp_perspective_api.cpp    |   6 +-
 .../src/transform/warp_perspective_neon.cpp   | 346 +++++++----
 kleidicv/src/transform/warp_perspective_sc.h  | 407 +++++++-----
 .../src/transform/warp_perspective_sve2.cpp   |   3 +-
 .../include/kleidicv_thread/kleidicv_thread.h |   1 +
 kleidicv_thread/src/kleidicv_thread.cpp       |   5 +-
 scripts/benchmark/benchmarks.txt              |   6 +-
 test/api/test_thread.cpp                      |  31 +-
 test/api/test_warp_perspective.cpp            | 585 ++++++++++++++++--
 test/framework/array.h                        |  47 +-
 24 files changed, 1218 insertions(+), 442 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d96fc0663..0e8807ac0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,8 @@ This changelog aims to follow the guiding principles of
 ### Added
 - Implementation of Rotate 90 degrees clockwise.
 - Remap function with nearest neighbour and fixed-point interpolation for 8-bit inputs.
+- WarpPerspective implementation
+  - Nearest and Linear interpolation method, for replicated borders and 1-channel u8 input.
 
 ### Changed
 - Increased precision of sum for 32 bit floats and expose it to OpenCV HAL.
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index 0a6aa9c6e..4c57b0857 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -353,7 +353,6 @@ static int from_opencv(int opencv_border_type,
   return 0;
 }
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 // Converts an OpenCV interpolation type to a KleidiCV interpolation type.
 static int from_opencv(int opencv_interpolation,
                        kleidicv_interpolation_type_t &interpolation_type) {
@@ -361,19 +360,15 @@ static int from_opencv(int opencv_interpolation,
     default:
       return 1;
     case CV_HAL_INTER_NEAREST:
-      interpolation_type =
-          kleidicv_interpolation_type_t::KLEIDICV_INTERPOLATION_NEAREST;
+      interpolation_type = KLEIDICV_INTERPOLATION_NEAREST;
       break;
     case CV_HAL_INTER_LINEAR:
-      interpolation_type =
-          kleidicv_interpolation_type_t::KLEIDICV_INTERPOLATION_LINEAR;
+      interpolation_type = KLEIDICV_INTERPOLATION_LINEAR;
       break;
   }
 
   return 0;
 }
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
-
 struct SeparableFilter2DParams {
   size_t channels;
   kleidicv_border_type_t border_type;
@@ -1421,24 +1416,23 @@ int scharr_deriv(const uchar *src_data, size_t src_step, int16_t *dst_data,
       src, src_step, width + 2, height + 2, cn, dst_data, dst_step, mt));
 }
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 int warp_perspective(int src_type, const uchar *src_data, size_t src_step,
                      int src_width, int src_height, uchar *dst_data,
                      size_t dst_step, int dst_width, int dst_height,
-                     const double transformation[9], int interpolation,
+                     const double transformation_f64[9], int interpolation,
                      int border_type, const double border_value_f64[4]) {
   kleidicv_border_type_t kleidicv_border_type;
   if (from_opencv(border_type, kleidicv_border_type)) {
     return CV_HAL_ERROR_NOT_IMPLEMENTED;
   }
 
-  float float_transformation[9];
+  float transformation[9];
   for (size_t i = 0; i < 9; ++i) {
-    float_transformation[i] = static_cast<float>(transformation[i]);
+    transformation[i] = static_cast<float>(transformation_f64[i]);
   }
 
-  kleidicv_interpolation_type_t kleidicv_interpolation_type;
-  if (from_opencv(interpolation, kleidicv_interpolation_type)) {
+  kleidicv_interpolation_type_t kleidicv_interpolation;
+  if (from_opencv(interpolation, kleidicv_interpolation)) {
     return CV_HAL_ERROR_NOT_IMPLEMENTED;
   }
 
@@ -1451,12 +1445,10 @@ int warp_perspective(int src_type, const uchar *src_data, size_t src_step,
         src_data, src_step, static_cast<size_t>(src_width),
         static_cast<size_t>(src_height), dst_data, dst_step,
         static_cast<size_t>(dst_width), static_cast<size_t>(dst_height),
-        float_transformation, 1, kleidicv_interpolation_type,
-        kleidicv_border_type, border_value.data(), mt));
+        transformation, 1, kleidicv_interpolation, kleidicv_border_type,
+        border_value.data(), mt));
   }
 
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
 }
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
-
 }  // namespace kleidicv::hal
diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index 18cc07e2a..2a5f04e31 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -158,13 +158,11 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step,
                     const uint16_t *mapfrac, size_t mapfrac_step,
                     int border_type, const double border_value[4]);
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 int warp_perspective(int src_type, const uchar *src_data, size_t src_step,
                      int src_width, int src_height, uchar *dst_data,
                      size_t dst_step, int dst_width, int dst_height,
                      const double transformation[9], int interpolation,
                      int borderType, const double borderValue[4]);
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 
 int scharr_deriv(const uchar *src_data, size_t src_step, int16_t *dst_data,
                  size_t dst_step, int width, int height, int cn);
@@ -432,7 +430,6 @@ static inline int kleidicv_pyrdown_with_fallback(
 #undef cv_hal_pyrdown
 #define cv_hal_pyrdown kleidicv_pyrdown_with_fallback
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 // warp_perspective
 static inline int kleidicv_warp_perspective_with_fallback(
     int src_type, const uchar *src_data, size_t src_step, int src_width,
@@ -448,8 +445,6 @@ static inline int kleidicv_warp_perspective_with_fallback(
 #undef cv_hal_warpPerspective
 #define cv_hal_warpPerspective kleidicv_warp_perspective_with_fallback
 
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
-
 #endif  // OPENCV_IMGPROC_HAL_REPLACEMENT_HPP
 
 #ifdef OPENCV_CORE_HAL_REPLACEMENT_HPP
diff --git a/adapters/opencv/opencv-4.10.patch b/adapters/opencv/opencv-4.10.patch
index 5283cbb8f..85599737b 100644
--- a/adapters/opencv/opencv-4.10.patch
+++ b/adapters/opencv/opencv-4.10.patch
@@ -19,7 +19,7 @@ index 2b4035285f..729cd1dd43 100644
 @@ -281,6 +281,11 @@ void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
      dst.create(dims, size, dtype);
      Mat dstMat = dst.getMat();
- 
+
 +    if( dims <= 2 ) {
 +        int width_in_elements = src.cols * cn;
 +        CALL_HAL(convertTo, cv_hal_convertTo, src.data, src.step, src.depth(), dstMat.data, dstMat.step, dstMat.depth(), width_in_elements, src.rows, alpha, beta);
@@ -35,7 +35,7 @@ index f78608dbad..a9384588ec 100644
 @@ -953,6 +953,53 @@ inline int hal_ni_transpose2d(const uchar* src_data, size_t src_step, uchar* dst
  #define cv_hal_transpose2d hal_ni_transpose2d
  //! @endcond
- 
+
 +/**
 +   @brief sum
 +   @param src_data,src_step,src_type Source image
@@ -84,8 +84,8 @@ index f78608dbad..a9384588ec 100644
 +//! @endcond
 +
  //! @}
- 
- 
+
+
 diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp
 index 8c6d8ad9a9..47eb6fdb66 100644
 --- a/modules/core/src/minmax.cpp
@@ -124,7 +124,7 @@ index 773fed9b48..b74ff70f99 100644
 @@ -328,6 +328,60 @@ inline int hal_ni_remap32f(int src_type, const uchar *src_data, size_t src_step,
  #define cv_hal_remap32f hal_ni_remap32f
  //! @endcond
- 
+
 +/**
 +   @brief hal_remap with a short integer map
 +   @param src_type source and destination image type
@@ -199,7 +199,7 @@ index d7c9c64c3c..348208b72d 100644
 +        CALL_HAL(remap16s16u, cv_hal_remap16s16u, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
 +                 map1.ptr<short>(), map1.step, map2.ptr<unsigned short>(), map2.step, borderType, borderValue.val);
      }
- 
+
      interpolation &= ~WARP_RELATIVE_MAP;
 diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp
 index d0f50a73bb..1c308887dc 100644
@@ -208,7 +208,7 @@ index d0f50a73bb..1c308887dc 100644
 @@ -654,6 +654,25 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
              ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)
      );
- 
+
 +    {
 +        Mat src = _src.getMat();
 +        Mat dst = _dst.getMat();
@@ -316,12 +316,12 @@ index 6d51c0cf1a..0e6f6a324e 100644
 +++ b/modules/video/src/lkpyramid.cpp
 @@ -50,6 +50,7 @@
  #endif
- 
+
  #include "opencv2/core/openvx/ovx_defs.hpp"
 +#include "hal_replacement.hpp"
- 
+
  #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
- 
+
 @@ -62,6 +63,9 @@ static void calcScharrDeriv(const cv::Mat& src, cv::Mat& dst)
      int rows = src.rows, cols = src.cols, cn = src.channels(), depth = src.depth();
      CV_Assert(depth == CV_8U);
@@ -331,4 +331,15 @@ index 6d51c0cf1a..0e6f6a324e 100644
 +
      parallel_for_(Range(0, rows), cv::detail::ScharrDerivInvoker(src, dst), cv::getNumThreads());
  }
- 
+
+diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp
+index 83a5e23781..75df962fb7 100644
+--- a/modules/imgproc/test/test_imgwarp_strict.cpp
++++ b/modules/imgproc/test/test_imgwarp_strict.cpp
+@@ -239,7 +239,7 @@ float CV_ImageWarpBaseTest::get_success_error_level(int _interpolation, int) con
+     else if (_interpolation == INTER_AREA)
+         return 2.0f;
+     else
+-        return 1.0f;
++        return 4.0f;    // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test
+ }
diff --git a/adapters/opencv/opencv-5.x.patch b/adapters/opencv/opencv-5.x.patch
index cc7f77c75..3f0298d27 100644
--- a/adapters/opencv/opencv-5.x.patch
+++ b/adapters/opencv/opencv-5.x.patch
@@ -27,7 +27,7 @@ index 56767f2ec7..5ade43b063 100644
 @@ -949,6 +951,13 @@ if(NOT DEFINED OpenCV_HAL)
    set(OpenCV_HAL "OpenCV_HAL")
  endif()
- 
+
 +if(WITH_KLEIDICV)
 +  ocv_debug_message(STATUS "Enable KleidiCV acceleration")
 +  if(NOT ";${OpenCV_HAL};" MATCHES ";kleidicv;")
@@ -55,7 +55,7 @@ index 3c61ae1c9a..f44e7d8dc2 100644
 +++ b/doc/tutorials/introduction/config_reference/config_reference.markdown
 @@ -621,6 +621,7 @@ Following build options are utilized in `opencv_contrib` modules, as stated [pre
  `CMAKE_TOOLCHAIN_FILE`
- 
+
  `WITH_CAROTENE`
 +`WITH_KLEIDICV`
  `WITH_CPUFEATURES`
@@ -66,14 +66,14 @@ index c7445a4de4..49a459597b 100644
 --- a/modules/core/include/opencv2/core/hal/interface.h
 +++ b/modules/core/include/opencv2/core/hal/interface.h
 @@ -90,6 +90,8 @@ typedef short cv_hal_bf16;
- 
+
  #define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
  #define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)
 +#define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
 +#define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
  #define CV_IS_INT_TYPE(flags)   (((1 << CV_MAT_DEPTH(flags)) & 0x1e1f) != 0)
  #define CV_IS_FLOAT_TYPE(flags) (((1 << CV_MAT_DEPTH(flags)) & 0x1e0) != 0)
- 
+
 diff --git a/modules/core/src/convert.dispatch.cpp b/modules/core/src/convert.dispatch.cpp
 index 4a8432fb0e..82cecf90c0 100644
 --- a/modules/core/src/convert.dispatch.cpp
@@ -81,7 +81,7 @@ index 4a8432fb0e..82cecf90c0 100644
 @@ -195,6 +195,11 @@ void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
      dst.create( dims, size, dtype, -1, allowTransposed );
      Mat dstMat = dst.getMat();
- 
+
 +    if( dims <= 2 ) {
 +        int width_in_elements = src.cols * cn;
 +        CALL_HAL(convertTo, cv_hal_convertTo, src.data, src.step, src.depth(), dstMat.data, dstMat.step, dstMat.depth(), width_in_elements, src.rows, alpha, beta);
@@ -97,7 +97,7 @@ index dc65a25684..5be57f0923 100644
 @@ -968,6 +968,21 @@ inline int hal_ni_transpose2d(const uchar* src_data, size_t src_step, uchar* dst
  #define cv_hal_transpose2d hal_ni_transpose2d
  //! @endcond
- 
+
 +/**
 +   @brief convertTo
 +   @param src_data,src_step,src_depth Source image
@@ -114,8 +114,8 @@ index dc65a25684..5be57f0923 100644
 +//! @endcond
 +
  //! @}
- 
- 
+
+
 diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp
 index 5c6497bd80..43e89d5b9c 100644
 --- a/modules/imgproc/src/hal_replacement.hpp
@@ -123,7 +123,7 @@ index 5c6497bd80..43e89d5b9c 100644
 @@ -880,6 +880,12 @@ inline int hal_ni_gaussianBlur(const uchar* src_data, size_t src_step, uchar* ds
  #define cv_hal_gaussianBlur hal_ni_gaussianBlur
  //! @endcond
- 
+
 +inline int hal_ni_gaussianBlurBinomial(const uchar*, size_t, uchar*, size_t, int, int, int, int, size_t, size_t, size_t, size_t, size_t, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 +
 +//! @cond IGNORED
@@ -140,7 +140,7 @@ index 3e23187de2..45737b1ee3 100644
 @@ -570,6 +570,20 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
              ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)
      );
- 
+
 +    if (sigma1 == 0.0 && sigma2 == 0.0 && ksize.height == ksize.width)
 +    {
 +        Mat src = _src.getMat();
@@ -158,3 +158,15 @@ index 3e23187de2..45737b1ee3 100644
      if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.isSubmatrix()))
      {
          std::vector<ufixedpoint16> fkx, fky;
+
+diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp
+index 83a5e23781..75df962fb7 100644
+--- a/modules/imgproc/test/test_imgwarp_strict.cpp
++++ b/modules/imgproc/test/test_imgwarp_strict.cpp
+@@ -239,7 +239,7 @@ float CV_ImageWarpBaseTest::get_success_error_level(int _interpolation, int) con
+     else if (_interpolation == INTER_AREA)
+         return 2.0f;
+     else
+-        return 1.0f;
++        return 4.0f;    // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test
+ }
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index bf178d1f5..94cff9f28 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -677,7 +677,6 @@ BENCH_REMAP_S16POINT5(remap_s16point5_u8_identity, remap_s16point5_u8,
                       get_identity_mapxy<int16_t>, 1,
                       KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 // clang-format off
 static const float transform_identity[] = {
   1.0, 0, 0,
@@ -735,23 +734,45 @@ static void warp_perspective(Function f, const float transform[9],
   }                                                                  \
   BENCHMARK(benchname)
 
-BENCH_WARP_PERSPECTIVE(warp_perspective_u8_identity, warp_perspective_u8,
-                       transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST,
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_identity,
+                       warp_perspective_u8, transform_identity, 1,
+                       KLEIDICV_INTERPOLATION_NEAREST,
                        KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
 
-BENCH_WARP_PERSPECTIVE(warp_perspective_u8_small, warp_perspective_u8,
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_small, warp_perspective_u8,
                        transform_small, 1, KLEIDICV_INTERPOLATION_NEAREST,
                        KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
 
-BENCH_WARP_PERSPECTIVE(warp_perspective_u8_bend, warp_perspective_u8,
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_bend, warp_perspective_u8,
                        transform_bend, 1, KLEIDICV_INTERPOLATION_NEAREST,
                        KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
 
-BENCH_WARP_PERSPECTIVE(warp_perspective_u8_rotate, warp_perspective_u8,
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_rotate, warp_perspective_u8,
                        transform_rotate, 1, KLEIDICV_INTERPOLATION_NEAREST,
                        KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
 
-BENCH_WARP_PERSPECTIVE(warp_perspective_u8_near, warp_perspective_u8,
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_near, warp_perspective_u8,
                        transform_near, 1, KLEIDICV_INTERPOLATION_NEAREST,
                        KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
+
+// WarpPerspective Linear
+
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_identity, warp_perspective_u8,
+                       transform_identity, 1, KLEIDICV_INTERPOLATION_LINEAR,
+                       KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_small, warp_perspective_u8,
+                       transform_small, 1, KLEIDICV_INTERPOLATION_LINEAR,
+                       KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_bend, warp_perspective_u8,
+                       transform_bend, 1, KLEIDICV_INTERPOLATION_LINEAR,
+                       KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_rotate, warp_perspective_u8,
+                       transform_rotate, 1, KLEIDICV_INTERPOLATION_LINEAR,
+                       KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+
+BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_near, warp_perspective_u8,
+                       transform_near, 1, KLEIDICV_INTERPOLATION_LINEAR,
+                       KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
diff --git a/conformity/opencv/test_warp_perspective.cpp b/conformity/opencv/test_warp_perspective.cpp
index 775e3d435..af57e1b1d 100644
--- a/conformity/opencv/test_warp_perspective.cpp
+++ b/conformity/opencv/test_warp_perspective.cpp
@@ -30,7 +30,7 @@ cv::Mat exec_warp_perspective(cv::Mat& source_mat) {
 }
 
 #if MANAGER
-const int kMaxHeight = 42, kMaxWidth = 42;
+const int kMaxHeight = 16, kMaxWidth = 65;
 
 template <class ScalarType, int Format, int Interpolation, int BorderMode,
           int BorderValue>
@@ -39,11 +39,20 @@ bool test_warp_perspective(int index, RecreatedMessageQueue& request_queue,
   for (size_t w = 8; w <= kMaxWidth; w += 3) {
     for (size_t h = 8; h <= kMaxHeight; h += 4) {
       cv::Mat source_mat(h, w, Format);
+      // Perspective calculation in float greatly amplifies any small errors
+      // coming from precision innaccuracies, let it be Fused-Multiply-Add or
+      // just the limitation of the single precision float. Taking the
+      // fractional part of a value in the thousands' range decreases the
+      // precision by 3 decimal digits, and the single precision float only has
+      // 7 decimal digits. Doing a linear interpolation between 0 and 255 would
+      // decrease it even more, but ensuring that neighbouring pixels have
+      // neighbouring values decreases this effect by more than 2 decimal digits
+      // so it can be expected that the error won't be bigger than 1.
       for (size_t row = 0; row < h; ++row) {
         for (size_t column = 0; column < w; ++column) {
+          const int kMaxVal = std::numeric_limits<ScalarType>::max();
           source_mat.at<ScalarType>(row, column) =
-              (row * (w + 12) + column) %
-              std::numeric_limits<ScalarType>::max();
+              abs(static_cast<int>(row + column) % (2 * kMaxVal + 1) - kMaxVal);
         }
       }
 
@@ -56,7 +65,7 @@ bool test_warp_perspective(int index, RecreatedMessageQueue& request_queue,
       bool success =
           !are_matrices_different<ScalarType>(1, actual_mat, expected_mat);
       if (!success) {
-        fail_print_matrices(w, h, source_mat, actual_mat, expected_mat);
+        fail_print_matrices(h, w, source_mat, actual_mat, expected_mat);
         return true;
       }
     }
@@ -68,7 +77,8 @@ bool test_warp_perspective(int index, RecreatedMessageQueue& request_queue,
 std::vector<test>& warp_perspective_tests_get() {
   // clang-format off
   static std::vector<test> tests = {
-    TEST("WarpPerspective uint8", (test_warp_perspective<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>), (exec_warp_perspective<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>)),
+    TEST("WarpPerspectiveNearest uint8", (test_warp_perspective<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>), (exec_warp_perspective<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>)),
+    TEST("WarpPerspectiveLinear uint8", (test_warp_perspective<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>), (exec_warp_perspective<uint8_t, CV_8UC1, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 0>)),
   };
   // clang-format on
   return tests;
diff --git a/doc/functionality.md b/doc/functionality.md
index 71f4860d8..3271c47ff 100644
--- a/doc/functionality.md
+++ b/doc/functionality.md
@@ -97,3 +97,9 @@ See `doc/opencv.md` for details of the functionality available in OpenCV.
 |--------------------------------------------|-----|-----|
 | Remap int16 coordinates                    |  x  |     |
 | Remap int16+uint16 fixed-point coordinates |  x  |     |
+
+# WarpPerspective
+|                                          |  u8 | u16 |
+|------------------------------------------|-----|-----|
+| Nearest neighbour, replicated borders    |  x  |     |
+| Linear interpolation, replicated borders |  x  |     |
diff --git a/doc/opencv.md b/doc/opencv.md
index 2c04dc4ec..bbc886fff 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -218,6 +218,17 @@ Supported map configurations:
 * `map1` is 16SC2 and `map2` is 16UC1: `map1` is as above, `map2` contains combined 5+5 bits of x (low) and y (high) fractions, i.e. x = x1 + x2 / 2^5
   * supported `interpolation`: `INTER_LINEAR` only
 
+### [`cv::warpPerspective()`](https://docs.opencv.org/4.10.0/da/d54/group__imgproc__transform.html#gaf73673a7e8e18ec6963e3774e6a94b87)
+Performs a perspective transformation on an image.
+
+Notes on parameters:
+* `src.depth()` - only supports `CV_8U` depth and 1 channel.
+* `src.cols`, `src.rows`, `dst.cols`, `dst.rows` - must be less than 2^24
+* `src.step` - must be less than 2^32
+* `dst.cols` - must be at least 8
+* `borderMode` - only supports `BORDER_REPLICATE`
+* `interpolation` - supports `INTER_NEAREST` and `INTER_LINEAR`
+
 ### [`cv::pyrDown()`](https://docs.opencv.org/4.10.0/d4/d86/group__imgproc__filter.html#gaf9bba239dfca11654cb7f50f889fc2ff)
 Blurs and downsamples an image.
 
diff --git a/kleidicv/CMakeLists.txt b/kleidicv/CMakeLists.txt
index 1412394ab..0b0fa2c81 100644
--- a/kleidicv/CMakeLists.txt
+++ b/kleidicv/CMakeLists.txt
@@ -50,7 +50,6 @@ option(KLEIDICV_ASSUME_128BIT_SVE2 "Internal - If turned ON 128-bit SVE2 vector
 option(KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE "Internal - If turned ON interleaving loads and stores are preferred instead of continuous loads and stores" OFF)
 option(KLEIDICV_EXPERIMENTAL_FEATURE_CANNY "Internal - Enable experimental Canny algorithm" OFF)
 option(KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV "Internal - If turned ON Canny algorithm creates bit exact result compared to OpenCV's original implementation" ON)
-option(KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE "Internal - Enable experimental WarpPerspective algorithm" OFF)
 
 if(KLEIDICV_ENABLE_SME2 AND NOT KLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS)
   set(KLEIDICV_ALWAYS_ENABLE_SME2 ON)
diff --git a/kleidicv/include/kleidicv/config.h.in b/kleidicv/include/kleidicv/config.h.in
index 3e28a4535..18a0970f3 100644
--- a/kleidicv/include/kleidicv/config.h.in
+++ b/kleidicv/include/kleidicv/config.h.in
@@ -21,8 +21,6 @@
 
 #cmakedefine01 KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV
 
-#cmakedefine01 KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
-
 // Set to '1' if compiling NEON code paths, otherwise it is set to '0'.
 #ifndef KLEIDICV_TARGET_NEON
 #define KLEIDICV_TARGET_NEON 0
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index bd4e38801..3c95ef4c3 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1870,7 +1870,6 @@ kleidicv_error_t kleidicv_scharr_interleaved_s16_u8(
     size_t src_channels, int16_t *dst, size_t dst_stride);
 #endif  // DOXYGEN
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 /// Performs a perspective transformation on an image.
 /// For each pixel in `dst` take a pixel from `src` specified by
 /// the transformed x and y coordinates, and optionally doing a bilinear
@@ -1894,12 +1893,14 @@ kleidicv_error_t kleidicv_scharr_interleaved_s16_u8(
 ///                       Must be a multiple of `sizeof(type)` and no less than
 ///                       `width * sizeof(type)`, except for single-row images.
 /// @param dst_width      Number of elements in the destination row. Must be at
-///                       least 8.
-/// @param dst_height     Number of rows in the destination data.
+///                       least 8. Must be less than 2^24.
+/// @param dst_height     Number of rows in the destination data. Must be less
+///                       than 2^24.
 /// @param transformation Pointer to the transformation matrix of 9 values.
 /// @param channels       Number of channels in the data. Must be 1.
 /// @param interpolation  Interpolation algorithm. Supported types: \n
 ///                         - @ref KLEIDICV_INTERPOLATION_NEAREST
+///                         - @ref KLEIDICV_INTERPOLATION_LINEAR
 /// @param border_type    Way of handling the border. The supported border types
 ///                       are: \n
 ///                         - @ref KLEIDICV_BORDER_TYPE_REPLICATE
@@ -1911,7 +1912,6 @@ kleidicv_error_t kleidicv_warp_perspective_u8(
     const float transformation[9], size_t channels,
     kleidicv_interpolation_type_t interpolation,
     kleidicv_border_type_t border_type, const uint8_t *border_value);
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/kleidicv/include/kleidicv/transform/warp_perspective.h b/kleidicv/include/kleidicv/transform/warp_perspective.h
index e5fe74526..45b512764 100644
--- a/kleidicv/include/kleidicv/transform/warp_perspective.h
+++ b/kleidicv/include/kleidicv/transform/warp_perspective.h
@@ -24,18 +24,17 @@ KLEIDICV_API_DECLARATION(kleidicv_warp_perspective_stripe_u8,
                          kleidicv_border_type_t border_type,
                          const uint8_t *border_value);
 }
-
 namespace kleidicv {
 
 template <typename T>
 inline bool warp_perspective_is_implemented(
-    size_t dst_width, kleidicv_interpolation_type_t interpolation,
-    kleidicv_border_type_t border_type,
-    size_t channels) KLEIDICV_STREAMING_COMPATIBLE {
+    size_t dst_width, size_t channels,
+    kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t border_type) KLEIDICV_STREAMING_COMPATIBLE {
   if constexpr (std::is_same<T, uint8_t>::value) {
     return (dst_width >= 8 &&
-            interpolation ==
-                kleidicv_interpolation_type_t::KLEIDICV_INTERPOLATION_NEAREST &&
+            (interpolation == KLEIDICV_INTERPOLATION_NEAREST ||
+             interpolation == KLEIDICV_INTERPOLATION_LINEAR) &&
             border_type ==
                 kleidicv_border_type_t::KLEIDICV_BORDER_TYPE_REPLICATE &&
             channels == 1);
diff --git a/kleidicv/include/kleidicv/utils.h b/kleidicv/include/kleidicv/utils.h
index e327cf938..e95ce94ea 100644
--- a/kleidicv/include/kleidicv/utils.h
+++ b/kleidicv/include/kleidicv/utils.h
@@ -301,16 +301,27 @@ class LoopUnroll2 final {
     const size_t n_step = UnrollFactor * step();
     size_t max_index = index_ + (remaining_length() / n_step) * n_step;
 
+    // A tail mechanism is built into the single vector processing loop, if
+    // enabled. The single vector path is executed iteratively, and at the end
+    // it rewinds the loop to one vector before the end of the data, and
+    // executes one final vector path, so the scalar path can be omitted.
     if constexpr (try_to_avoid_tail_loop<Tail> && (UnrollFactor == 1)) {
-      while (index_ < max_index) {
-        while (index_ < max_index) {
-          callback(index_);
-          index_ += n_step;
-        }
-
-        if (remaining_length()) {
-          index_ = length_ - n_step;
-          max_index = length_;
+      // Enter this loop only if there's enough data for a full vector
+      if (length_ >= n_step) {
+        // External loop only ends when all data has been processed
+        while (index_ < length_) {
+          // Internal loop checks if the vector path can be executed
+          while (index_ < max_index) {
+            callback(index_);
+            index_ += n_step;
+          }
+          // Check if a final iteration is needed. The double loop is needed to
+          // avoid the repetition of the callback function, which is usually
+          // inlined into the binary. (Save some code space)
+          if (remaining_length()) {
+            index_ = length_ - n_step;
+            max_index = length_;
+          }
         }
       }
     } else {
diff --git a/kleidicv/src/transform/warp_perspective_api.cpp b/kleidicv/src/transform/warp_perspective_api.cpp
index 797b603c4..16b80e121 100644
--- a/kleidicv/src/transform/warp_perspective_api.cpp
+++ b/kleidicv/src/transform/warp_perspective_api.cpp
@@ -2,12 +2,11 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+#include "kleidicv/ctypes.h"
 #include "kleidicv/dispatch.h"
 #include "kleidicv/kleidicv.h"
 #include "kleidicv/transform/warp_perspective.h"
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
-
 KLEIDICV_MULTIVERSION_C_API(kleidicv_warp_perspective_stripe_u8,
                             &kleidicv::neon::warp_perspective_stripe<uint8_t>,
                             &kleidicv::sve2::warp_perspective_stripe<uint8_t>,
@@ -22,7 +21,7 @@ kleidicv_error_t kleidicv_warp_perspective_u8(
     kleidicv_interpolation_type_t interpolation,
     kleidicv_border_type_t border_type, const uint8_t *border_value) {
   if (!kleidicv::warp_perspective_is_implemented<uint8_t>(
-          dst_width, interpolation, border_type, channels)) {
+          dst_width, channels, interpolation, border_type)) {
     return KLEIDICV_ERROR_NOT_IMPLEMENTED;
   }
 
@@ -33,4 +32,3 @@ kleidicv_error_t kleidicv_warp_perspective_u8(
 }
 
 }  // extern "C"
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp
index 7c84ee2d4..bba3f7e82 100644
--- a/kleidicv/src/transform/warp_perspective_neon.cpp
+++ b/kleidicv/src/transform/warp_perspective_neon.cpp
@@ -2,13 +2,14 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+#include <arm_neon.h>
+
 #include "kleidicv/ctypes.h"
 #include "kleidicv/kleidicv.h"
 #include "kleidicv/neon.h"
 #include "kleidicv/traits.h"
 #include "kleidicv/transform/warp_perspective.h"
-
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
+#include "kleidicv/utils.h"
 
 namespace kleidicv::neon {
 
@@ -29,112 +30,207 @@ namespace kleidicv::neon {
 //      xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8)
 //      yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8)
 //
-template <typename ScalarType, bool IsLarge>
-class WarpPerspective;
-
-template <bool IsLarge>
-class WarpPerspective<uint8_t, IsLarge> {
- public:
-  using ScalarType = uint8_t;
-  using CoordVecTraits = VecTraits<float>;
-  using CoordVector = CoordVecTraits::VectorType;
-
-  WarpPerspective(Rows<const ScalarType> src_rows, size_t src_width,
-                  size_t src_height)
-      : src_rows_{src_rows},
-        src_height_{src_height},
-        v_src_stride_{vdup_n_u32(static_cast<uint32_t>(src_rows_.stride()))},
-        vq_src_stride_{vdupq_n_u32(static_cast<uint32_t>(src_rows_.stride()))},
-        x0123_{vld1q(first_few_x)},
-        v_xmax_{vdupq_n_f32(static_cast<float>(src_width - 1))},
-        v_ymax_{vdupq_n_f32(static_cast<float>(src_height - 1))} {}
-
-  void process_row(size_t y, size_t width, const float transform[9],
-                   Columns<ScalarType> dst) {
+
+typedef struct {
+  float32x4_t x, y;
+} CoordVectorPair;
+
+template <typename ScalarType, kleidicv_interpolation_type_t Inter,
+          bool IsLarge>
+void warp_perspective_operation(Rows<const ScalarType> src_rows,
+                                size_t src_width, size_t src_height,
+                                const float transform[9],
+                                Rows<ScalarType> dst_rows, size_t dst_width,
+                                size_t y_begin, size_t y_end) {
+  static constexpr uint32_t first_few_x[] = {0, 1, 2, 3};
+  uint32x4_t x0123_ = vld1q_u32(first_few_x);
+
+  uint32x2_t v_src_stride =
+      vdup_n_u32(static_cast<uint32_t>(src_rows.stride()));
+  uint32x4_t vq_src_stride =
+      vdupq_n_u32(static_cast<uint32_t>(src_rows.stride()));
+  uint32x4_t v_xmax = vdupq_n_u32(static_cast<uint32_t>(src_width - 1));
+  uint32x4_t v_ymax = vdupq_n_u32(static_cast<uint32_t>(src_height - 1));
+
+  float32x4_t tx0, ty0, tw0;
+
+  auto calculate_coordinates = [&](uint32_t x) {
+    // The next few values can be calculated by adding the corresponding Tn*x
+    float32x4_t fx = vcvtq_f32_u32(vaddq_u32(x0123_, vdupq_n_u32(x)));
+    float32x4_t tx = vmlaq_n_f32(tx0, fx, transform[0]);
+    float32x4_t ty = vmlaq_n_f32(ty0, fx, transform[3]);
+    float32x4_t tw = vmlaq_n_f32(tw0, fx, transform[6]);
+
+    // Calculate inverse weight because division is expensive
+    float32x4_t iw = vdivq_f32(vdupq_n_f32(1.F), tw);
+    // Calculate coordinates into the source image
+    float32x4_t xf = vmulq_f32(tx, iw);
+    float32x4_t yf = vmulq_f32(ty, iw);
+    return CoordVectorPair{xf, yf};
+  };
+
+  auto vector_path_nearest_small = [&](uint32_t x, Columns<ScalarType> dst) {
+    auto &&[xf, yf] = calculate_coordinates(x);
+    // Take the integer part, clamp it to within the dimensions of the
+    // source image (negative values are already saturated to 0)
+    uint32x4_t xi = vminq_u32(v_xmax, vcvtaq_u32_f32(xf));
+    uint32x4_t yi = vminq_u32(v_ymax, vcvtaq_u32_f32(yf));
+    // Calculate offsets from coordinates (y * stride + x)
+    // Use this path only when the final offsets fit into 32 bits
+    uint32x4_t offsets = vmlaq_u32(xi, yi, vq_src_stride);
+    // Copy pixels from source
+    ptrdiff_t ix = static_cast<ptrdiff_t>(x);
+    dst[ix] = src_rows[vgetq_lane_u32(offsets, 0)];
+    dst[ix + 1] = src_rows[vgetq_lane_u32(offsets, 1)];
+    dst[ix + 2] = src_rows[vgetq_lane_u32(offsets, 2)];
+    dst[ix + 3] = src_rows[vgetq_lane_u32(offsets, 3)];
+  };
+
+  auto vector_path_nearest_large = [&](uint32_t x, Columns<ScalarType> dst) {
+    auto &&[xf, yf] = calculate_coordinates(x);
+    // Take the integer part, clamp it to within the dimensions of the
+    // source image (negative values are already saturated to 0)
+    uint32x4_t xi = vminq_u32(v_xmax, vcvtaq_u32_f32(xf));
+    uint32x4_t yi = vminq_u32(v_ymax, vcvtaq_u32_f32(yf));
+    // Calculate offsets from coordinates (y * stride + x)
+    // To avoid losing precision, the final offsets should be in 64 bits
+    uint64x2_t offsets_low =
+        vmlal_u32(vmovl_u32(vget_low_u32(xi)), vget_low_u32(yi), v_src_stride);
+    uint64x2_t offsets_high =
+        vmlal_u32(vmovl_high_u32(xi), vget_high_u32(yi), v_src_stride);
+    // Copy pixels from source
+    ptrdiff_t ix = static_cast<ptrdiff_t>(x);
+    dst[ix] = src_rows[vgetq_lane_u64(offsets_low, 0)];
+    dst[ix + 1] = src_rows[vgetq_lane_u64(offsets_low, 1)];
+    dst[ix + 2] = src_rows[vgetq_lane_u64(offsets_high, 0)];
+    dst[ix + 3] = src_rows[vgetq_lane_u64(offsets_high, 1)];
+  };
+
+  auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) {
+    uint32x4_t offset = vmlaq_u32(x, y, vq_src_stride);
+    uint64_t acc =
+        static_cast<uint64_t>(src_rows[vgetq_lane_u32(offset, 0)]) |
+        (static_cast<uint64_t>(src_rows[vgetq_lane_u32(offset, 1)]) << 32);
+    uint64x2_t rawsrc = vdupq_n_u64(acc);
+    acc = static_cast<uint64_t>(src_rows[vgetq_lane_u32(offset, 2)]) |
+          (static_cast<uint64_t>(src_rows[vgetq_lane_u32(offset, 3)]) << 32);
+    rawsrc = vsetq_lane_u64(acc, rawsrc, 1);
+    return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
+  };
+
+  auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) {
+    uint64x2_t offset_low =
+        vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride);
+    uint64x2_t offset_high =
+        vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride);
+    uint64_t acc =
+        static_cast<uint64_t>(src_rows[vgetq_lane_u64(offset_low, 0)]) |
+        (static_cast<uint64_t>(src_rows[vgetq_lane_u64(offset_low, 1)]) << 32);
+    uint64x2_t rawsrc = vdupq_n_u64(acc);
+    acc =
+        static_cast<uint64_t>(src_rows[vgetq_lane_u64(offset_high, 0)]) |
+        (static_cast<uint64_t>(src_rows[vgetq_lane_u64(offset_high, 1)]) << 32);
+    rawsrc = vsetq_lane_u64(acc, rawsrc, 1);
+    return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
+  };
+
+  auto calculate_linear = [&](uint32_t x) {
+    auto load_floats = [&](uint32x4_t x, uint32x4_t y) {
+      if constexpr (IsLarge) {
+        return load_src_into_floats_large(x, y);
+      } else {
+        return load_src_into_floats_small(x, y);
+      }
+    };
+    auto &&[xf, yf] = calculate_coordinates(x);
+    // Truncating convert to int
+    uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax);
+    uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax);
+
+    // Get fractional part, or 0 if out of range
+    float32x4_t zero = vdupq_n_f32(0.F);
+    uint32x4_t x_in_range =
+        vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax));
+    uint32x4_t y_in_range =
+        vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax));
+    float32x4_t xfrac =
+        vbslq_f32(x_in_range, vsubq_f32(xf, vrndmq_f32(xf)), zero);
+    float32x4_t yfrac =
+        vbslq_f32(y_in_range, vsubq_f32(yf, vrndmq_f32(yf)), zero);
+
+    // x1 = x0 + 1, except if it's already xmax or out of range
+    uint32x4_t x1 = vsubq_u32(x0, x_in_range);
+    uint32x4_t y1 = vsubq_u32(y0, y_in_range);
+
+    // Calculate offsets from coordinates (y * stride + x)
+    // a: top left, b: top right, c: bottom left, d: bottom right
+    float32x4_t a = load_floats(x0, y0);
+    float32x4_t b = load_floats(x1, y0);
+    float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac);
+    float32x4_t c = load_floats(x0, y1);
+    float32x4_t d = load_floats(x1, y1);
+    float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac);
+    float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac);
+    return vminq_u32(vdupq_n_u32(0xFF), vcvtaq_u32_f32(result));
+  };
+
+  auto process_row = [&](uint32_t y, Columns<ScalarType> dst) {
     float dy = static_cast<float>(y);
     // Calculate half-transformed values at the first pixel (nominators)
     // tw =  T6*x + T7*y + T8
     // tx = (T0*x + T1*y + T2) / tw
     // ty = (T3*x + T4*y + T5) / tw
-    float x0 = transform[1] * dy + transform[2];
-    float y0 = transform[4] * dy + transform[5];
-    float w0 = transform[7] * dy + transform[8];
-    // The next few values can be calculated by adding the corresponding Tn*x
-    CoordVector tx0 =
-        vmlaq_f32(vdupq_n_f32(x0), x0123_, vdupq_n_f32(transform[0]));
-    CoordVector ty0 =
-        vmlaq_f32(vdupq_n_f32(y0), x0123_, vdupq_n_f32(transform[3]));
-    CoordVector tw0 =
-        vmlaq_f32(vdupq_n_f32(w0), x0123_, vdupq_n_f32(transform[6]));
-
-    // (Nearest) integer part of transformed coordinates
-    uint32x4_t xi, yi;
-
-    auto calculate_coordinates = [&](size_t x) {
-      float fx = static_cast<float>(x);
-      // Calculate half-transformed values from the first few pixel values, plus
-      // Tn*x, similarly to the one above
-      CoordVector tx = vaddq_f32(tx0, vdupq_n_f32(transform[0] * fx));
-      CoordVector ty = vaddq_f32(ty0, vdupq_n_f32(transform[3] * fx));
-      CoordVector tw = vaddq_f32(tw0, vdupq_n_f32(transform[6] * fx));
-
-      // Calculate inverse weight because division is expensive
-      CoordVector iw = vdivq_f32(vdupq_n_f32(1.F), tw);
-      // Calc and clamp coordinates to within the dimensions of the source image
-      CoordVector xf =
-          vmaxq_f32(vdupq_n_f32(0.F), vminq_f32(vmulq_f32(tx, iw), v_xmax_));
-      CoordVector yf =
-          vmaxq_f32(vdupq_n_f32(0.F), vminq_f32(vmulq_f32(ty, iw), v_ymax_));
-      // Rounding convert to int
-      xi = vcvtaq_u32_f32(xf);
-      yi = vcvtaq_u32_f32(yf);
-    };
-
-    auto large_vector_path = [&](size_t x) {
-      calculate_coordinates(x);
-      // Calculate offsets from coordinates (y * stride + x)
-      // To avoid losing precision, the final indices should be in 64 bits
-      uint64x2_t indices_low = vmlal_u32(vmovl_u32(vget_low_u32(xi)),
-                                         vget_low_u32(yi), v_src_stride_);
-      uint64x2_t indices_high =
-          vmlal_u32(vmovl_high_u32(xi), vget_high_u32(yi), v_src_stride_);
-      // Copy pixels from source
-      ptrdiff_t ix = static_cast<ptrdiff_t>(x);
-      dst[ix] = src_rows_[vgetq_lane_u64(indices_low, 0)];
-      dst[ix + 1] = src_rows_[vgetq_lane_u64(indices_low, 1)];
-      dst[ix + 2] = src_rows_[vgetq_lane_u64(indices_high, 0)];
-      dst[ix + 3] = src_rows_[vgetq_lane_u64(indices_high, 1)];
-    };
-
-    auto small_vector_path = [&](size_t x) {
-      calculate_coordinates(x);
-      // Calculate offsets from coordinates (y * stride + x)
-      // Use this path only when the final indices fit into 32 bits
-      uint32x4_t indices = vmlaq_u32(xi, yi, vq_src_stride_);
-      // Copy pixels from source
-      ptrdiff_t ix = static_cast<ptrdiff_t>(x);
-      dst[ix] = src_rows_[vgetq_lane_u32(indices, 0)];
-      dst[ix + 1] = src_rows_[vgetq_lane_u32(indices, 1)];
-      dst[ix + 2] = src_rows_[vgetq_lane_u32(indices, 2)];
-      dst[ix + 3] = src_rows_[vgetq_lane_u32(indices, 3)];
-    };
+    tx0 = vdupq_n_f32(transform[1] * dy + transform[2]);
+    ty0 = vdupq_n_f32(transform[4] * dy + transform[5]);
+    tw0 = vdupq_n_f32(transform[7] * dy + transform[8]);
 
-    LoopUnroll2<TryToAvoidTailLoop> loop{width, CoordVecTraits::num_lanes()};
-    if constexpr (IsLarge) {
-      loop.unroll_once(large_vector_path);
+    static const size_t kStep = VecTraits<float>::num_lanes();
+    LoopUnroll2<TryToAvoidTailLoop> loop{dst_width, kStep};
+    if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
+      if constexpr (IsLarge) {
+        loop.unroll_once([&](size_t x) {
+          vector_path_nearest_large(static_cast<uint32_t>(x), dst);
+        });
+      } else {
+        loop.unroll_once([&](size_t x) {
+          vector_path_nearest_small(static_cast<uint32_t>(x), dst);
+        });
+      }
+    } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) {
+      loop.unroll_four_times([&](size_t _x) {
+        uint32_t x = static_cast<uint32_t>(_x);
+        ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(_x)];
+        uint32x4_t res0 = calculate_linear(x);
+        x += kStep;
+        uint32x4_t res1 = calculate_linear(x);
+        uint16x8_t result16_0 = vuzp1q_u16(res0, res1);
+        x += kStep;
+        res0 = calculate_linear(x);
+        x += kStep;
+        res1 = calculate_linear(x);
+        uint16x8_t result16_1 = vuzp1q_u16(res0, res1);
+        vst1q_u8(p_dst, vuzp1q_u8(result16_0, result16_1));
+      });
+      loop.unroll_once([&](size_t x) {
+        ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
+        uint32x4_t res = calculate_linear(static_cast<uint32_t>(x));
+        p_dst[0] = vgetq_lane_u32(res, 0);
+        p_dst[1] = vgetq_lane_u32(res, 1);
+        p_dst[2] = vgetq_lane_u32(res, 2);
+        p_dst[3] = vgetq_lane_u32(res, 3);
+      });
     } else {
-      loop.unroll_once(small_vector_path);
+      static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST ||
+                        Inter == KLEIDICV_INTERPOLATION_LINEAR,
+                    ": Unknown interpolation type!");
     }
-  }
+  };
 
- private:
-  static constexpr float first_few_x[] = {0.F, 1.F, 2.F, 3.F};
-  Rows<const ScalarType> src_rows_;
-  size_t src_height_;
-  uint32x2_t v_src_stride_;
-  uint32x4_t vq_src_stride_;
-  CoordVector x0123_, v_xmax_, v_ymax_;
-};  // end of class WarpPerspective<uint8_t>
+  for (size_t y = y_begin; y < y_end; ++y) {
+    process_row(y, dst_rows.as_columns());
+    ++dst_rows;
+  }
+}
 
 // Most of the complexity comes from parameter checking.
 // NOLINTBEGIN(readability-function-cognitive-complexity)
@@ -143,8 +239,8 @@ kleidicv_error_t warp_perspective_stripe(
     const T *src, size_t src_stride, size_t src_width, size_t src_height,
     T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
     size_t y_begin, size_t y_end, const float transformation[9],
-    size_t channels, kleidicv_interpolation_type_t, kleidicv_border_type_t,
-    const T *) {
+    size_t channels, kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t, const T *) {
   CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
   CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
   CHECK_POINTERS(transformation);
@@ -153,35 +249,50 @@ kleidicv_error_t warp_perspective_stripe(
 
   // Calculating in float32_t will only be precise until 24 bits, and
   // multiplication can only be done with 32x32 bits
-  if (src_stride >= (1ULL << 32) || src_width >= (1ULL << 24) ||
-      src_height >= (1ULL << 24)) {
+  if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) ||
+      dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) ||
+      src_stride >= (1ULL << 32)) {
     return KLEIDICV_ERROR_RANGE;
   }
 
   Rows<const T> src_rows{src, src_stride, channels};
   Rows<T> dst_rows{dst, dst_stride, channels};
-  Rectangle rect{dst_width, dst_height};
   dst_rows += y_begin;
 
-  if (KLEIDICV_LIKELY(src_stride * src_height < (1ULL << 32))) {
-    WarpPerspective<T, false> operation{src_rows, src_width, src_height};
-    for (size_t y = y_begin; y < y_end; ++y) {
-      operation.process_row(y, rect.width(), transformation,
-                            dst_rows.as_columns());
-      ++dst_rows;
+  if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
+    switch (interpolation) {
+      case KLEIDICV_INTERPOLATION_NEAREST:
+        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_NEAREST, true>(
+            src_rows, src_width, src_height, transformation, dst_rows,
+            dst_width, y_begin, y_end);
+        break;
+      case KLEIDICV_INTERPOLATION_LINEAR:
+        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_LINEAR, true>(
+            src_rows, src_width, src_height, transformation, dst_rows,
+            dst_width, y_begin, y_end);
+        break;
+      default:
+        break;  // GCOVR_EXCL_LINE
     }
   } else {
-    WarpPerspective<T, true> operation{src_rows, src_width, src_height};
-    for (size_t y = y_begin; y < y_end; ++y) {
-      operation.process_row(y, rect.width(), transformation,
-                            dst_rows.as_columns());
-      ++dst_rows;
+    switch (interpolation) {
+      case KLEIDICV_INTERPOLATION_NEAREST:
+        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_NEAREST, false>(
+            src_rows, src_width, src_height, transformation, dst_rows,
+            dst_width, y_begin, y_end);
+        break;
+      case KLEIDICV_INTERPOLATION_LINEAR:
+        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_LINEAR, false>(
+            src_rows, src_width, src_height, transformation, dst_rows,
+            dst_width, y_begin, y_end);
+        break;
+      default:
+        break;  // GCOVR_EXCL_LINE
     }
   }
-
+  // NOLINTEND(readability-function-cognitive-complexity)
   return KLEIDICV_OK;
 }
-// NOLINTEND(readability-function-cognitive-complexity)
 
 #define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(type)                            \
   template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t                           \
@@ -195,4 +306,3 @@ kleidicv_error_t warp_perspective_stripe(
 KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(uint8_t);
 
 }  // namespace kleidicv::neon
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h
index aaa358ec7..a790a6acf 100644
--- a/kleidicv/src/transform/warp_perspective_sc.h
+++ b/kleidicv/src/transform/warp_perspective_sc.h
@@ -11,8 +11,8 @@
 #include "kleidicv/sve2.h"
 #include "kleidicv/traits.h"
 #include "kleidicv/transform/warp_perspective.h"
+#include "kleidicv/types.h"
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 namespace KLEIDICV_TARGET_NAMESPACE {
 
 // Gather load is not available in streaming mode, and in general random access
@@ -36,161 +36,248 @@ namespace KLEIDICV_TARGET_NAMESPACE {
 //      xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8)
 //      yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8)
 //
-template <typename ScalarType>
-class WarpPerspective;
-
-template <>
-class WarpPerspective<uint8_t> {
- public:
-  using ScalarType = uint8_t;
-  using DstVecTraits = VecTraits<ScalarType>;
-  using CoordVecTraits = VecTraits<float>;
-  using CoordVector = CoordVecTraits::VectorType;
-
-  WarpPerspective(Rows<const ScalarType> src_rows, size_t src_width,
-                  size_t src_height,
-                  const float tr_[9]) KLEIDICV_STREAMING_COMPATIBLE
-      : src_rows_{src_rows},
-        src_width_{src_width},
-        src_height_{src_height},
-        tr_{tr_} {}
-
-  void process_row(size_t y, size_t width, Columns<ScalarType> dst) {
-    svfloat32_t svzero = svdup_n_f32(0.F);
-    svfloat32_t sv_xmax = svdup_n_f32(static_cast<float>(src_width_ - 1));
-    svfloat32_t sv_ymax = svdup_n_f32(static_cast<float>(src_height_ - 1));
-    svuint32_t sv_src_stride = svdup_n_u32(src_rows_.stride());
-    svbool_t pg_all64 = svptrue_b64();
-    svbool_t pg_all32 = svptrue_b32();
-    svbool_t pg_all8 = svptrue_b8();
-    float fy = static_cast<float>(y);
-    svfloat32_t vx0 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1));
-    // (Nearest) integer part of transformed coordinates
-    svuint32_t xi, yi;
-    // Calculate half-transformed base values (nominators) at the first vector
-    // (x=0)
-    //     tw =  T6*x + T7*y + T8
-    //     tx = (T0*x + T1*y + T2) / tw
-    //     ty = (T3*x + T4*y + T5) / tw
-    svfloat32_t tw0 = svdup_n_f32(tr_[7] * fy + tr_[8]);
-    svfloat32_t tx0 = svdup_n_f32(tr_[1] * fy + tr_[2]);
-    svfloat32_t ty0 = svdup_n_f32(tr_[4] * fy + tr_[5]);
-    auto calculate_coordinates = [&](size_t x) {
-      svfloat32_t vx = svadd_n_f32_x(pg_all32, vx0, static_cast<float>(x));
-      svfloat32_t tw = svmla_x(pg_all32, tw0, vx, tr_[6]);
-      svfloat32_t tx = svmla_x(pg_all32, tx0, vx, tr_[0]);
-      svfloat32_t ty = svmla_x(pg_all32, ty0, vx, tr_[3]);
-
-      // Calculate inverse weight because division is expensive
-      svfloat32_t iw = svdiv_f32_x(pg_all32, svdup_n_f32(1.F), tw);
-      // Calc and clamp coordinates to within the dimensions of the source image
-      svfloat32_t dx =
-          svmax_x(pg_all32, svzero,
-                  svmin_x(pg_all32, svmad_x(pg_all32, tx, iw, 0.5F), sv_xmax));
-      svfloat32_t dy =
-          svmax_x(pg_all32, svzero,
-                  svmin_x(pg_all32, svmad_x(pg_all32, ty, iw, 0.5F), sv_ymax));
-
-      xi = svcvt_u32_f32_x(pg_all32, dx);
-      yi = svcvt_u32_f32_x(pg_all32, dy);
-    };
 
-    auto calculate_and_load_64bits = [&](svbool_t pg_b, svbool_t pg_t,
-                                         size_t x) {
-      calculate_coordinates(x);
-      svuint64_t indices_b = svmlalb(svmovlb(xi), yi, sv_src_stride);
-      svuint64_t indices_t = svmlalt(svmovlt(xi), yi, sv_src_stride);
-      // Copy pixels from source
-      svuint64_t result_b =
-          svld1ub_gather_offset_u64(pg_b, &src_rows_[0], indices_b);
-      svuint64_t result_t =
-          svld1ub_gather_offset_u64(pg_t, &src_rows_[0], indices_t);
-      return svtrn1_u32(svreinterpret_u32_u64(result_b),
-                        svreinterpret_u32_u64(result_t));
-    };
+template <typename ScalarType, kleidicv_interpolation_type_t Inter,
+          bool IsLarge>
+void warp_perspective_operation(Rows<const ScalarType> src_rows,
+                                size_t src_width, size_t src_height,
+                                const float transform[9],
+                                Rows<ScalarType> dst_rows, size_t dst_width,
+                                size_t y_begin, size_t y_end) {
+  svbool_t pg_all64 = svptrue_b64();
+  svbool_t pg_all32 = svptrue_b32();
+  svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1));
+  svuint32_t sv_xmax = svdup_n_u32(src_width - 1);
+  svuint32_t sv_ymax = svdup_n_u32(src_height - 1);
+  svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride());
 
-    auto calculate_and_load_32bits = [&](svbool_t pg, size_t x) {
-      calculate_coordinates(x);
-      svuint32_t indices = svmla_x(pg, xi, yi, sv_src_stride);
-      return svld1ub_gather_offset_u32(pg, &src_rows_[0], indices);
-    };
+  svfloat32_t T0 = svdup_n_f32(transform[0]);
+  svfloat32_t T3 = svdup_n_f32(transform[3]);
+  svfloat32_t T6 = svdup_n_f32(transform[6]);
+  svfloat32_t tx0, ty0, tw0;
+  svfloat32_t xf, yf;
+  svfloat32_t xmaxf = svdup_n_f32(static_cast<float>(src_width - 1));
+  svfloat32_t ymaxf = svdup_n_f32(static_cast<float>(src_height - 1));
+  svuint32_t xi, yi;
 
-    auto vector_path_generic = [&](size_t x, size_t x_max) {
-      size_t length = x_max - x;
-      svbool_t pg_b = svwhilelt_b64(0ULL, (length + 1) / 2);
-      svbool_t pg_t = svwhilelt_b64(0ULL, length / 2);
-      svbool_t pg = svwhilelt_b32(0ULL, length);
-      // To avoid losing precision, the final indices use 64 bits
-      svuint32_t result = calculate_and_load_64bits(pg_b, pg_t, x);
-      svst1b_u32(pg, &dst[0], result);
-      dst += static_cast<ptrdiff_t>(length);
-    };
+  svbool_t pg32 = pg_all32;
+  svbool_t pg64_b = pg_all64, pg64_t = pg_all64;
+
+  const size_t kStep = VecTraits<float>::num_lanes();
+
+  auto calculate_coordinates = [&](size_t x) {
+    svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast<float>(x));
+    // Calculate half-transformed values from the first few pixel values,
+    // plus Tn*x, similarly to the one above
+    // Calculate inverse weight because division is expensive
+    svfloat32_t iw =
+        svdiv_f32_x(pg_all32, svdup_n_f32(1.F), svmla_x(pg_all32, tw0, vx, T6));
+    svfloat32_t tx = svmla_x(pg_all32, tx0, vx, T0);
+    svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3);
+
+    // Calculate coordinates into the source image
+    xf = svmul_f32_x(pg_all32, tx, iw);
+    yf = svmul_f32_x(pg_all32, ty, iw);
+  };
 
-    auto vector_path_4x_large = [&](size_t x) {
-      // Copy pixels from source
-      // To avoid losing precision, the final indices use 64 bits
-      svuint32_t res32_0 = calculate_and_load_64bits(pg_all64, pg_all64, x);
-      x += CoordVecTraits::num_lanes();
-      svuint32_t res32_1 = calculate_and_load_64bits(pg_all64, pg_all64, x);
-      svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
-                                      svreinterpret_u16_u32(res32_1));
-      x += CoordVecTraits::num_lanes();
-      res32_0 = calculate_and_load_64bits(pg_all64, pg_all64, x);
-      x += CoordVecTraits::num_lanes();
-      res32_1 = calculate_and_load_64bits(pg_all64, pg_all64, x);
-      svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
-                                      svreinterpret_u16_u32(res32_1));
-      svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0),
-                                   svreinterpret_u8_u16(result1));
-      svst1(pg_all8, &dst[0], result);
-      dst += static_cast<ptrdiff_t>(4 * CoordVecTraits::num_lanes());
+  auto calculate_nearest_coordinates = [&](size_t x) {
+    calculate_coordinates(x);
+    // Round to the nearest integer, clamp it to within the dimensions of the
+    // source image (negative values are already saturated to 0)
+    xi = svmin_x(pg_all32,
+                 svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)),
+                 sv_xmax);
+    yi = svmin_x(pg_all32,
+                 svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)),
+                 sv_ymax);
+  };
+
+  auto load_small = [&](svbool_t pg, svuint32_t x, svuint32_t y) {
+    svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
+    return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets);
+  };
+
+  auto load_large = [&](svbool_t pg_b, svbool_t pg_t, svuint32_t x,
+                        svuint32_t y) {
+    // Calculate offsets from coordinates (y * stride + x)
+    // To avoid losing precision, the final offsets should be in 64 bits
+    svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride);
+    svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride);
+    // Copy pixels from source
+    svuint64_t result_b =
+        svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
+    svuint64_t result_t =
+        svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
+    return svtrn1_u32(svreinterpret_u32_u64(result_b),
+                      svreinterpret_u32_u64(result_t));
+  };
+
+  auto vector_path_nearest_4x = [&](size_t x, Columns<ScalarType> dst) {
+    auto load_source = [&](svuint32_t x, svuint32_t y) {
+      if constexpr (IsLarge) {
+        return load_large(pg_all64, pg_all64, x, y);
+      } else {
+        return load_small(pg_all32, x, y);
+      }
     };
+    ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
+    calculate_nearest_coordinates(x);
+    svuint32_t res32_0 = load_source(xi, yi);
+    x += kStep;
+    calculate_nearest_coordinates(x);
+    svuint32_t res32_1 = load_source(xi, yi);
+    svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
+                                    svreinterpret_u16_u32(res32_1));
+    x += kStep;
+    calculate_nearest_coordinates(x);
+    res32_0 = load_source(xi, yi);
+    x += kStep;
+    calculate_nearest_coordinates(x);
+    res32_1 = load_source(xi, yi);
+    svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
+                                    svreinterpret_u16_u32(res32_1));
+    svuint8_t result =
+        svuzp1_u8(svreinterpret_u8_u16(result0), svreinterpret_u8_u16(result1));
+    svst1(svptrue_b8(), p_dst, result);
+  };
+
+  auto vector_path_nearest_tail = [&](size_t x, size_t x_max,
+                                      Columns<ScalarType> dst) {
+    size_t length = x_max - x;
+    svbool_t pg_b = svwhilelt_b64(0ULL, (length + 1) / 2);
+    svbool_t pg_t = svwhilelt_b64(0ULL, length / 2);
+    svbool_t pg = svwhilelt_b32(0ULL, length);
+    // To avoid losing precision, the final indices use 64 bits
+    calculate_nearest_coordinates(x);
+    svuint32_t result = load_large(pg_b, pg_t, xi, yi);
+    svst1b_u32(pg, &dst[static_cast<ptrdiff_t>(x)], result);
+  };
 
-    auto vector_path_4x_small = [&](size_t x) {
-      // Copy pixels from source
-      // Use this path only when the final indices fit into 32 bits
-      svuint32_t res32_0 = calculate_and_load_32bits(pg_all32, x);
-      x += CoordVecTraits::num_lanes();
-      svuint32_t res32_1 = calculate_and_load_32bits(pg_all32, x);
-      svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
-                                      svreinterpret_u16_u32(res32_1));
-      x += CoordVecTraits::num_lanes();
-      res32_0 = calculate_and_load_32bits(pg_all32, x);
-      x += CoordVecTraits::num_lanes();
-      res32_1 = calculate_and_load_32bits(pg_all32, x);
-      svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
-                                      svreinterpret_u16_u32(res32_1));
-      svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0),
-                                   svreinterpret_u8_u16(result1));
-      svst1(pg_all8, &dst[0], result);
-      dst += static_cast<ptrdiff_t>(4 * CoordVecTraits::num_lanes());
+  auto calculate_linear = [&](size_t x) {
+    auto load_source = [&](svuint32_t x, svuint32_t y) {
+      if constexpr (IsLarge) {
+        return load_large(pg64_b, pg64_t, x, y);
+      } else {
+        return load_small(pg32, x, y);
+      }
     };
 
-    LoopUnroll2 loop{width, CoordVecTraits::num_lanes()};
-    if (KLEIDICV_LIKELY(src_rows_.stride() * src_height_ < (1ULL << 32))) {
-      loop.unroll_four_times(vector_path_4x_small);
+    calculate_coordinates(x);
+    // Take the integer part, clamp it to within the dimensions of the
+    // source image (negative values are already saturated to 0)
+    svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf));
+    svuint32_t y0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, yf, ymaxf));
+
+    // Get fractional part, or 0 if out of range
+    svbool_t x_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, xf, 0.F),
+                                  svcmplt_f32(pg_all32, xf, xmaxf));
+    svbool_t y_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, yf, 0.F),
+                                  svcmplt_f32(pg_all32, yf, ymaxf));
+    svfloat32_t xfrac = svsel_f32(
+        x_in_range, svsub_f32_x(pg_all32, xf, svrintm_x(pg_all32, xf)),
+        svdup_n_f32(0.F));
+    svfloat32_t yfrac = svsel_f32(
+        y_in_range, svsub_f32_x(pg_all32, yf, svrintm_x(pg_all32, yf)),
+        svdup_n_f32(0.F));
+
+    // x1 = x0 + 1, except if it's already xmax or out of range
+    svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0);
+    svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0);
+
+    // Calculate offsets from coordinates (y * stride + x)
+    // a: top left, b: top right, c: bottom left, d: bottom right
+    svfloat32_t a = svcvt_f32_u32_x(pg_all32, load_source(x0, y0));
+    svfloat32_t b = svcvt_f32_u32_x(pg_all32, load_source(x1, y0));
+    svfloat32_t line0 =
+        svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
+    svfloat32_t c = svcvt_f32_u32_x(pg_all32, load_source(x0, y1));
+    svfloat32_t d = svcvt_f32_u32_x(pg_all32, load_source(x1, y1));
+    svfloat32_t line1 =
+        svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
+    svfloat32_t result = svmla_f32_x(
+        pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
+    return svmin_u32_x(
+        pg_all32, svdup_n_u32(0xFF),
+        svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)));
+  };
+
+  auto process_row = [&](size_t y, Columns<ScalarType> dst) {
+    float fy = static_cast<float>(y);
+    // Calculate half-transformed values at the first pixel (nominators)
+    // tw =  T6*x + T7*y + T8
+    // tx = (T0*x + T1*y + T2) / tw
+    // ty = (T3*x + T4*y + T5) / tw
+    tx0 = svdup_n_f32(fmaf(transform[1], fy, transform[2]));
+    ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5]));
+    tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8]));
+
+    pg32 = pg_all32;
+    pg64_b = pg64_t = pg_all64;
+
+    LoopUnroll2 loop{dst_width, kStep};
+    if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
+      loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); });
+      loop.unroll_once(
+          [&](size_t x) { vector_path_nearest_tail(x, x + kStep, dst); });
+      loop.remaining([&](size_t x, size_t length) {
+        vector_path_nearest_tail(x, length, dst);
+      });
+    } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) {
+      loop.unroll_four_times([&](size_t x) {
+        ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
+        svuint32_t res0 = calculate_linear(x);
+        x += kStep;
+        svuint32_t res1 = calculate_linear(x);
+        svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0),
+                                           svreinterpret_u16_u32(res1));
+        x += kStep;
+        res0 = calculate_linear(x);
+        x += kStep;
+        res1 = calculate_linear(x);
+        svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0),
+                                           svreinterpret_u16_u32(res1));
+        svst1_u8(svptrue_b8(), p_dst,
+                 svuzp1_u8(svreinterpret_u8_u16(result16_0),
+                           svreinterpret_u8_u16(result16_1)));
+      });
+      loop.unroll_once([&](size_t x) {
+        ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
+        svuint32_t result = calculate_linear(x);
+        svst1b_u32(pg_all32, p_dst, result);
+      });
+      loop.remaining([&](size_t x, size_t x_max) {
+        ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
+        size_t length = x_max - x;
+        pg32 = svwhilelt_b32(0ULL, length);
+        if constexpr (IsLarge) {
+          pg64_b = svwhilelt_b64(0ULL, (length + 1) / 2);
+          pg64_t = svwhilelt_b64(0ULL, length / 2);
+        }
+        svuint32_t result = calculate_linear(x);
+        svst1b_u32(pg32, p_dst, result);
+      });
     } else {
-      loop.unroll_four_times(vector_path_4x_large);
+      static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST ||
+                        Inter == KLEIDICV_INTERPOLATION_LINEAR,
+                    ": Unknown interpolation type!");
     }
-    loop.unroll_once([&](size_t x) {
-      vector_path_generic(x, x + CoordVecTraits::num_lanes());
-    });
-    loop.remaining(vector_path_generic);
-  }
+  };
 
- private:
-  Rows<const ScalarType> src_rows_;
-  size_t src_width_, src_height_;
-  const float *tr_;
-};  // end of class WarpPerspective<uint8_t>
+  for (size_t y = y_begin; y < y_end; ++y) {
+    process_row(y, dst_rows.as_columns());
+    ++dst_rows;
+  }
+}
 
+// Most of the complexity comes from parameter checking.
+// NOLINTBEGIN(readability-function-cognitive-complexity)
 template <typename T>
 KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc(
     const T *src, size_t src_stride, size_t src_width, size_t src_height,
     T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
     size_t y_begin, size_t y_end, const float transformation[9],
-    size_t channels, kleidicv_interpolation_type_t, kleidicv_border_type_t,
-    const T *) {
+    size_t channels, kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t, const T *) {
   CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
   CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
   CHECK_POINTERS(transformation);
@@ -199,24 +286,53 @@ KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc(
 
   // Calculating in float32_t will only be precise until 24 bits, and
   // multiplication can only be done with 32x32 bits
-  if (src_stride >= (1ULL << 32) || src_width >= (1ULL << 24) ||
-      src_height >= (1ULL << 24)) {
+  if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) ||
+      dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) ||
+      src_stride >= (1ULL << 32)) {
     return KLEIDICV_ERROR_RANGE;
   }
 
   Rows<const T> src_rows{src, src_stride, channels};
   Rows<T> dst_rows{dst, dst_stride, channels};
-  WarpPerspective<T> operation{src_rows, src_width, src_height, transformation};
   Rectangle rect{dst_width, dst_height};
 
   dst_rows += y_begin;
-  for (size_t y = y_begin; y < y_end; ++y) {
-    operation.process_row(y, rect.width(), dst_rows.as_columns());
-    ++dst_rows;
+
+  if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) {
+    switch (interpolation) {
+      case KLEIDICV_INTERPOLATION_NEAREST:
+        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_NEAREST, true>(
+            src_rows, src_width, src_height, transformation, dst_rows,
+            dst_width, y_begin, y_end);
+        break;
+      case KLEIDICV_INTERPOLATION_LINEAR:
+        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_LINEAR, true>(
+            src_rows, src_width, src_height, transformation, dst_rows,
+            dst_width, y_begin, y_end);
+        break;
+      default:
+        break;  // GCOVR_EXCL_LINE
+    }
+  } else {
+    switch (interpolation) {
+      case KLEIDICV_INTERPOLATION_NEAREST:
+        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_NEAREST, false>(
+            src_rows, src_width, src_height, transformation, dst_rows,
+            dst_width, y_begin, y_end);
+        break;
+      case KLEIDICV_INTERPOLATION_LINEAR:
+        warp_perspective_operation<T, KLEIDICV_INTERPOLATION_LINEAR, false>(
+            src_rows, src_width, src_height, transformation, dst_rows,
+            dst_width, y_begin, y_end);
+        break;
+      default:
+        break;  // GCOVR_EXCL_LINE
+    }
   }
 
   return KLEIDICV_OK;
 }
+// NOLINTEND(readability-function-cognitive-complexity)
 
 #define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE_SC(type)                         \
   template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t                           \
@@ -232,4 +348,3 @@ KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE_SC(uint8_t);
 #endif
 
 }  // namespace KLEIDICV_TARGET_NAMESPACE
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
diff --git a/kleidicv/src/transform/warp_perspective_sve2.cpp b/kleidicv/src/transform/warp_perspective_sve2.cpp
index 8310053d3..5e3649050 100644
--- a/kleidicv/src/transform/warp_perspective_sve2.cpp
+++ b/kleidicv/src/transform/warp_perspective_sve2.cpp
@@ -2,9 +2,9 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+#include "kleidicv/ctypes.h"
 #include "warp_perspective_sc.h"
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 namespace kleidicv::sve2 {
 
 template <typename T>
@@ -32,4 +32,3 @@ kleidicv_error_t warp_perspective_stripe(
 KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(uint8_t);
 
 }  // namespace kleidicv::sve2
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
index 51189095f..0bd45c7b1 100644
--- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
+++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
@@ -5,6 +5,7 @@
 #ifndef KLEIDICV_THREAD_H
 #define KLEIDICV_THREAD_H
 
+#include "kleidicv/ctypes.h"
 #include "kleidicv/kleidicv.h"
 
 #ifdef __cplusplus
diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp
index b6fcecc6b..166da5d28 100644
--- a/kleidicv_thread/src/kleidicv_thread.cpp
+++ b/kleidicv_thread/src/kleidicv_thread.cpp
@@ -10,6 +10,7 @@
 #include <vector>
 
 #include "kleidicv/arithmetics/rotate.h"
+#include "kleidicv/ctypes.h"
 #include "kleidicv/filters/blur_and_downsample.h"
 #include "kleidicv/filters/gaussian_blur.h"
 #include "kleidicv/filters/scharr.h"
@@ -697,7 +698,6 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u8(
   return parallel_batches(callback, mt, dst_height);
 }
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 kleidicv_error_t kleidicv_thread_warp_perspective_u8(
     const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
     uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
@@ -706,7 +706,7 @@ kleidicv_error_t kleidicv_thread_warp_perspective_u8(
     kleidicv_border_type_t border_type, const uint8_t *border_value,
     kleidicv_thread_multithreading mt) {
   if (!kleidicv::warp_perspective_is_implemented<uint8_t>(
-          dst_width, interpolation, border_type, channels)) {
+          dst_width, channels, interpolation, border_type)) {
     return KLEIDICV_ERROR_NOT_IMPLEMENTED;
   }
 
@@ -718,4 +718,3 @@ kleidicv_error_t kleidicv_thread_warp_perspective_u8(
   };
   return parallel_batches(callback, mt, dst_height);
 }
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt
index 48060a762..c3bde40f5 100755
--- a/scripts/benchmark/benchmarks.txt
+++ b/scripts/benchmark/benchmarks.txt
@@ -80,8 +80,10 @@ InRange_F32: opencv_perf_core '*inRangeScalar/*' '($PIXEL_FORMAT, 32FC1, 1, 2)'
 
 Remap_S16_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)'
 Remap_S16Point5_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)'
-WarpPerspective: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)'
-WarpPerspectiveNear: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)'
+WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)'
+WarpPerspectiveNear_Nearest: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)'
+WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 8UC1)'
+WarpPerspectiveNear_Linear: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 8UC1)'
 
 BlurAndDownsample: opencv_perf_imgproc '*pyrDown/*' '($PIXEL_FORMAT, 8UC1)'
 
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
index e36bfbfe5..6f09b8855 100644
--- a/test/api/test_thread.cpp
+++ b/test/api/test_thread.cpp
@@ -9,6 +9,7 @@
 
 #include "framework/array.h"
 #include "framework/generator.h"
+#include "kleidicv/ctypes.h"
 #include "kleidicv/kleidicv.h"
 #include "kleidicv_thread/kleidicv_thread.h"
 #include "multithreading_fake.h"
@@ -247,7 +248,9 @@ class Thread : public testing::TestWithParam<P> {
             typename... Args>
   void check_warp_perspective(SingleThreadedFunc single_threaded_func,
                               MultithreadedFunc multithreaded_func,
-                              size_t channels, Args... args) {
+                              size_t channels,
+                              kleidicv_interpolation_type_t interpolation,
+                              Args... args) {
     unsigned test_width = 0, height = 0, thread_count = 0;
     std::tie(test_width, height, thread_count) = GetParam();
     const unsigned src_width = 300, src_height = 300;
@@ -265,12 +268,13 @@ class Thread : public testing::TestWithParam<P> {
     // clang-format on
     kleidicv_error_t single_result = single_threaded_func(
         src.data(), src.stride(), src_width, src_height, dst_single.data(),
-        dst_single.stride(), width, height, transform, channels, args...);
+        dst_single.stride(), width, height, transform, channels, interpolation,
+        args...);
 
     kleidicv_error_t multi_result = multithreaded_func(
         src.data(), src.stride(), src_width, src_height, dst_multi.data(),
-        dst_multi.stride(), width, height, transform, channels, args...,
-        get_multithreading_fake(thread_count));
+        dst_multi.stride(), width, height, transform, channels, interpolation,
+        args..., get_multithreading_fake(thread_count));
 
     EXPECT_EQ(KLEIDICV_OK, single_result);
     EXPECT_EQ(KLEIDICV_OK, multi_result);
@@ -625,7 +629,6 @@ TEST_P(Thread, remap_s16point5_u8_not_implemented) {
       border_value);
 }
 
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 TEST_P(Thread, warp_perspective_u8_border_replicate) {
   const uint8_t border_value[4] = {};
   check_warp_perspective<uint8_t>(kleidicv_warp_perspective_u8,
@@ -634,19 +637,29 @@ TEST_P(Thread, warp_perspective_u8_border_replicate) {
                                   KLEIDICV_BORDER_TYPE_REPLICATE, border_value);
 }
 
+TEST_P(Thread, warp_perspective_u8_linear_border_replicate) {
+  const uint8_t border_value[4] = {};
+  check_warp_perspective<uint8_t>(kleidicv_warp_perspective_u8,
+                                  kleidicv_thread_warp_perspective_u8, 1,
+                                  KLEIDICV_INTERPOLATION_LINEAR,
+                                  KLEIDICV_BORDER_TYPE_REPLICATE, border_value);
+}
+
 TEST_P(Thread, warp_perspective_u8_not_implemented) {
   const uint8_t border_value[4] = {};
-  check_warp_perspective_not_implemented<uint8_t>(
-      kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_LINEAR,
-      KLEIDICV_BORDER_TYPE_REPLICATE, border_value);
   check_warp_perspective_not_implemented<uint8_t>(
       kleidicv_thread_warp_perspective_u8, 2, KLEIDICV_INTERPOLATION_NEAREST,
       KLEIDICV_BORDER_TYPE_REPLICATE, border_value);
   check_warp_perspective_not_implemented<uint8_t>(
       kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_NEAREST,
       KLEIDICV_BORDER_TYPE_CONSTANT, border_value);
+  check_warp_perspective_not_implemented<uint8_t>(
+      kleidicv_thread_warp_perspective_u8, 2, KLEIDICV_INTERPOLATION_LINEAR,
+      KLEIDICV_BORDER_TYPE_REPLICATE, border_value);
+  check_warp_perspective_not_implemented<uint8_t>(
+      kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_LINEAR,
+      KLEIDICV_BORDER_TYPE_CONSTANT, border_value);
 }
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
 
 TEST_P(Thread, SobelHorizontal1Channel) {
   check_unary_op<uint8_t, int16_t>(kleidicv_sobel_3x3_horizontal_s16_u8,
diff --git a/test/api/test_warp_perspective.cpp b/test/api/test_warp_perspective.cpp
index 60c457f8f..b7b81ba38 100644
--- a/test/api/test_warp_perspective.cpp
+++ b/test/api/test_warp_perspective.cpp
@@ -4,13 +4,15 @@
 
 #include <gtest/gtest.h>
 
+#include <cmath>
+#include <limits>
+
 #include "framework/array.h"
 #include "framework/generator.h"
 #include "framework/utils.h"
 #include "kleidicv/ctypes.h"
 #include "kleidicv/kleidicv.h"
-
-#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
+#include "kleidicv/transform/warp_perspective.h"
 
 // clang-format off
 static const float transform_identity[] = {
@@ -32,19 +34,34 @@ static const float transform_small[] = {
 };
 // clang-format on
 
+// Perspective calculation in float greatly amplifies any small errors coming
+// from precision innaccuracies, let it be Fused-Multiply-Add or just the
+// limitation of the single precision float.
+// Taking the fractional part of a value in the thousands' range decreases
+// the precision by 3 decimal digits, and the single precision float only has
+// 7 decimal digits. Doing a linear interpolation between 0 and 255 would
+// decrease it even more, but ensuring that neighbouring pixels have
+// neighbouring values decreases this effect by more than 2 decimal digits so it
+// can be expected that the error won't be bigger than 1.
 template <class ScalarType>
-static void random_initializer(test::Array2D<ScalarType> &source) {
-  test::PseudoRandomNumberGenerator<ScalarType> generator;
-  source.fill(generator);
+static void sequential_initializer(test::Array2D<ScalarType> &source) {
+  for (int64_t y = 0; y < static_cast<int64_t>(source.height()); ++y) {
+    for (int64_t x = 0; x < static_cast<int64_t>(source.width()); ++x) {
+      const int64_t kMaxVal = std::numeric_limits<ScalarType>::max();
+      *source.at(y, x) = abs((x + y) % (2 * kMaxVal + 1) - kMaxVal);
+    }
+  }
 }
 
 template <class ScalarType>
-class WarpPerspective : public testing::Test {
+class WarpPerspectiveNearest : public testing::Test {
  public:
-  static void test(
-      size_t src_w, size_t src_h, size_t dst_w, size_t dst_h,
-      const float transform[9], size_t channels, size_t padding,
-      void (*initializer)(test::Array2D<ScalarType> &) = random_initializer) {
+  static void test_stripe(size_t src_w, size_t src_h, size_t dst_w,
+                          size_t dst_h, size_t y_begin, size_t y_end,
+                          const float transform[9], size_t channels,
+                          size_t padding,
+                          void (*initializer)(test::Array2D<ScalarType> &) =
+                              sequential_initializer) {
     size_t src_total_width = channels * src_w;
     size_t dst_total_width = channels * dst_w;
 
@@ -54,24 +71,61 @@ class WarpPerspective : public testing::Test {
                                        channels};
 
     initializer(source);
-    actual.fill(42);
-
-    calculate_expected(source, transform, expected);
+    for (size_t row = y_begin; row < y_end; ++row) {
+      for (size_t column = 0; column < actual.width(); ++column) {
+        *actual.at(row, column) = 42;
+      }
+    }
 
-    ASSERT_EQ(KLEIDICV_OK, kleidicv_warp_perspective_u8(
-                               source.data(), source.stride(), source.width(),
-                               source.height(), actual.data(), actual.stride(),
-                               actual.width(), actual.height(), transform,
-                               channels, KLEIDICV_INTERPOLATION_NEAREST,
-                               KLEIDICV_BORDER_TYPE_REPLICATE, {}));
+    calculate_expected(source, y_begin, y_end, transform, expected);
+
+    ASSERT_EQ(
+        KLEIDICV_OK,
+        kleidicv_warp_perspective_stripe_u8(
+            source.data(), source.stride(), source.width(), source.height(),
+            actual.data(), actual.stride(), actual.width(), actual.height(),
+            y_begin, y_end, transform, channels, KLEIDICV_INTERPOLATION_NEAREST,
+            KLEIDICV_BORDER_TYPE_REPLICATE, {}));
+
+    ScalarType threshold = 1;
+    for (size_t row = y_begin; row < y_end; ++row) {
+      for (size_t column = 0; column < expected.width(); ++column) {
+        const ScalarType *lhs = expected.at(row, column);
+        const ScalarType *rhs = actual.at(row, column);
+        if (!lhs || !rhs) {
+          GTEST_FAIL() << "Nullptr at (row=" << row << ", column=" << column
+                       << "): " << reinterpret_cast<const void *>(lhs) << " , "
+                       << reinterpret_cast<const void *>(lhs) << std::endl;
+        }
+        ScalarType error;
+        if constexpr (std::is_integral<ScalarType>::value) {
+          error = static_cast<ScalarType>(
+              abs(static_cast<int64_t>(lhs[0]) - static_cast<int64_t>(rhs[0])));
+        } else {
+          error = std::abs(lhs[0] - rhs[0]);
+        }
+        if (error > threshold) {
+          GTEST_FAIL() << "Mismatch at (row=" << row << ", col=" << column
+                       << "): " << +(expected).at(row, column)[0] << " vs "
+                       << +(actual).at(row, column)[0] << "." << std::endl;
+        }
+      }
+    }
+  }
 
-    EXPECT_EQ_ARRAY2D(actual, expected);
+  static void test(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h,
+                   const float transform[9], size_t channels, size_t padding,
+                   void (*initializer)(test::Array2D<ScalarType> &) =
+                       sequential_initializer) {
+    test_stripe(src_w, src_h, dst_w, dst_h, 0, dst_h, transform, channels,
+                padding, initializer);
   }
 
   static void test_special_source(
       size_t src_w, size_t src_h, size_t src_stride, size_t dst_w, size_t dst_h,
       const float transform[9], size_t channels,
-      void (*initializer)(test::Array2D<ScalarType> &) = random_initializer) {
+      void (*initializer)(test::Array2D<ScalarType> &) =
+          sequential_initializer) {
     size_t src_total_width = channels * src_w;
     size_t dst_total_width = channels * dst_w;
 
@@ -88,7 +142,7 @@ class WarpPerspective : public testing::Test {
 
     actual.fill(42);
 
-    calculate_expected(source, transform, expected);
+    calculate_expected(source, 0, expected.height(), transform, expected);
 
     ASSERT_EQ(KLEIDICV_OK,
               kleidicv_warp_perspective_u8(
@@ -97,14 +151,14 @@ class WarpPerspective : public testing::Test {
                   channels, KLEIDICV_INTERPOLATION_NEAREST,
                   KLEIDICV_BORDER_TYPE_REPLICATE, {}));
 
-    EXPECT_EQ_ARRAY2D(actual, expected);
+    EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected);
   }
 
  private:
-  static void calculate_expected(test::Array2D<ScalarType> &src,
-                                 const float transform[9],
+  static void calculate_expected(test::Array2D<ScalarType> &src, size_t y_begin,
+                                 size_t y_end, const float transform[9],
                                  test::Array2D<ScalarType> &expected) {
-    for (size_t y = 0; y < expected.height(); ++y) {
+    for (size_t y = y_begin; y < y_end; ++y) {
       for (size_t x = 0; x < expected.width() / src.channels(); ++x) {
         float dx = static_cast<float>(x), dy = static_cast<float>(y);
 
@@ -128,9 +182,9 @@ class WarpPerspective : public testing::Test {
 };
 
 using WarpPerspectiveElementTypes = ::testing::Types<uint8_t>;
-TYPED_TEST_SUITE(WarpPerspective, WarpPerspectiveElementTypes);
+TYPED_TEST_SUITE(WarpPerspectiveNearest, WarpPerspectiveElementTypes);
 
-TYPED_TEST(WarpPerspective, IdentityNoPadding) {
+TYPED_TEST(WarpPerspectiveNearest, IdentityNoPadding) {
   size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
   size_t src_h = 4;
   size_t dst_w = src_w;
@@ -138,7 +192,7 @@ TYPED_TEST(WarpPerspective, IdentityNoPadding) {
   TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, 0);
 }
 
-TYPED_TEST(WarpPerspective, TransposeNoPadding) {
+TYPED_TEST(WarpPerspectiveNearest, TransposeNoPadding) {
   size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
   size_t src_h = 4;
   size_t dst_w = src_w;
@@ -146,7 +200,7 @@ TYPED_TEST(WarpPerspective, TransposeNoPadding) {
   TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, 0);
 }
 
-TYPED_TEST(WarpPerspective, SmallPadding) {
+TYPED_TEST(WarpPerspectiveNearest, SmallPadding) {
   size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
   size_t src_h = 4;
   size_t dst_w = src_w;
@@ -154,7 +208,7 @@ TYPED_TEST(WarpPerspective, SmallPadding) {
   TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, 13);
 }
 
-TYPED_TEST(WarpPerspective, Upscale) {
+TYPED_TEST(WarpPerspectiveNearest, Upscale) {
   // clang-format off
   const float transform_upscale[] = {
     0.2, 0.05, 0.1,
@@ -170,7 +224,7 @@ TYPED_TEST(WarpPerspective, Upscale) {
   TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3);
 }
 
-TYPED_TEST(WarpPerspective, RandomTransform) {
+TYPED_TEST(WarpPerspectiveNearest, RandomTransform) {
   float transform[9];
   test::PseudoRandomNumberGenerator<float> generator;
   for (size_t i = 0; i < 9; ++i) {
@@ -184,7 +238,7 @@ TYPED_TEST(WarpPerspective, RandomTransform) {
   TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, 19);
 }
 
-TYPED_TEST(WarpPerspective, DivisionByZero) {
+TYPED_TEST(WarpPerspectiveNearest, DivisionByZero) {
   // clang-format off
   const float transform_div_by_zero[] = {
     1, 0, 0,
@@ -200,38 +254,100 @@ TYPED_TEST(WarpPerspective, DivisionByZero) {
   TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, 3);
 }
 
-static const size_t big_width = 1ULL << 17, big_height = 1ULL << 17;
-static const size_t part_width = 16, part_height = 16;
+static const size_t kBigWidth = 1ULL << 17, kBigHeight = 1ULL << 17;
+static const size_t kPartWidth = 21, kPartHeight = 21;
 
 template <class ScalarType>
 static void part_initializer(test::Array2D<ScalarType> &source) {
   ScalarType counter = 0;
-  for (size_t y = big_height; y < big_height + part_height; ++y) {
-    for (size_t x = big_width; x < big_width + part_width; ++x) {
+  for (size_t y = kBigHeight; y < kBigHeight + kPartHeight; ++y) {
+    for (size_t x = kBigWidth; x < kBigWidth + kPartWidth; ++x) {
       *source.at(y, x) = ++counter;
     }
   }
 }
 
-TYPED_TEST(WarpPerspective, BigSourceImage) {
+TYPED_TEST(WarpPerspectiveNearest, BigSourceImage) {
   // clang-format off
   const float transform_cut[] = {
-    1, 0, 1ULL<<17,
-    0, 1, 1ULL<<17,
+    1, 0, kBigWidth,
+    0, 1, kBigHeight,
     0, 0, 1
   };
   // clang-format on
 
   // Let stride * height be bigger than 1 << 32
-  size_t dst_w = part_width;
-  size_t dst_h = part_height;
-  size_t src_w = big_width + part_width;
-  size_t src_h = big_height + part_height;
+  size_t dst_w = kPartWidth;
+  size_t dst_h = kPartHeight;
+  size_t src_w = kBigWidth + kPartWidth;
+  size_t src_h = kBigHeight + kPartHeight;
 
   TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, 0,
                     part_initializer);
 }
 
+TYPED_TEST(WarpPerspectiveNearest, BigWidthDestination) {
+  // clang-format off
+  const float transform_upscale[] = {
+    0.0011, 0, 0.3,
+    0, 1.02, -0.1,
+    0, 0, 1
+  };
+  // clang-format on
+
+  size_t dst_w = kBigWidth;
+  size_t dst_h = 1;
+  size_t src_w = dst_w / 1000;
+  size_t src_h = dst_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3);
+}
+
+static const size_t kHugeHeight = (1ULL << 24) - 1;
+TYPED_TEST(WarpPerspectiveNearest, BigHeightDestination) {
+  // clang-format off
+  const float transform_upscale[] = {
+    1.03, 0, 0.1,
+    0, 0.00011, -0.3,
+    0, 0, 1
+  };
+  // clang-format on
+
+  size_t dst_w = 17;
+  size_t dst_h = kHugeHeight;
+  size_t src_w = dst_w;
+  size_t src_h = dst_h / 10000;
+  TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight,
+                           dst_h, transform_upscale, 1, 0);
+}
+
+template <class ScalarType>
+static void huge_height_part_initializer(test::Array2D<ScalarType> &source) {
+  ScalarType counter = 0;
+  for (size_t y = kHugeHeight - kPartHeight; y < kHugeHeight; ++y) {
+    for (size_t x = 0; x < 17; ++x) {
+      *source.at(y, x) = ++counter;
+    }
+  }
+}
+
+TYPED_TEST(WarpPerspectiveNearest, HugeHeightSourceAndDestination) {
+  // clang-format off
+  const float transform[] = {
+    1.03, 0, 0.1,
+    0, 0.9993, -0.3,
+    0, 0, 1
+  };
+  // clang-format on
+
+  size_t dst_w = 17;
+  size_t dst_h = (1ULL << 24) - 1;
+  size_t src_w = dst_w;
+  size_t src_h = dst_h;
+  TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - 16, dst_h,
+                           transform, 1, 0,
+                           huge_height_part_initializer<TypeParam>);
+}
+
 static const size_t oneline_big_width = 1ULL << 23;
 static const size_t oneline_part_width = 16, oneline_part_offset = 1ULL << 17,
                     oneline_dst_height = 16;
@@ -245,7 +361,7 @@ static void oneline_part_initializer(test::Array2D<ScalarType> &source) {
   }
 }
 
-TYPED_TEST(WarpPerspective, OneLineBigSourceImage) {
+TYPED_TEST(WarpPerspectiveNearest, OneLineBigSourceImage) {
   // clang-format off
   const float tr[] = {
     1, 0, oneline_part_offset,
@@ -266,7 +382,7 @@ TYPED_TEST(WarpPerspective, OneLineBigSourceImage) {
                                    tr, 1, oneline_part_initializer);
 }
 
-TYPED_TEST(WarpPerspective, OneLineSmallSourceImage) {
+TYPED_TEST(WarpPerspectiveNearest, OneLineSmallSourceImage) {
   // clang-format off
   const float tr[] = {
     -1, 0, 1,
@@ -283,7 +399,7 @@ TYPED_TEST(WarpPerspective, OneLineSmallSourceImage) {
   TestFixture::test_special_source(src_w, src_h, src_h, dst_w, dst_h, tr, 1);
 }
 
-TYPED_TEST(WarpPerspective, NullPointer) {
+TYPED_TEST(WarpPerspectiveNearest, NullPointer) {
   const TypeParam src[4] = {};
   TypeParam dst[8];
   test::test_null_args(kleidicv_warp_perspective_u8, src, 2, 2, 2, dst, 8, 8, 1,
@@ -291,7 +407,7 @@ TYPED_TEST(WarpPerspective, NullPointer) {
                        KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
 }
 
-TYPED_TEST(WarpPerspective, ZeroImageSize) {
+TYPED_TEST(WarpPerspectiveNearest, ZeroImageSize) {
   const TypeParam src[1] = {};
   TypeParam dst[1];
 
@@ -307,7 +423,7 @@ TYPED_TEST(WarpPerspective, ZeroImageSize) {
                 nullptr));
 }
 
-TYPED_TEST(WarpPerspective, InvalidImageSize) {
+TYPED_TEST(WarpPerspectiveNearest, InvalidImageSize) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
 
@@ -338,7 +454,7 @@ TYPED_TEST(WarpPerspective, InvalidImageSize) {
                 nullptr));
 }
 
-TYPED_TEST(WarpPerspective, UnsupportedTwoChannels) {
+TYPED_TEST(WarpPerspectiveNearest, UnsupportedTwoChannels) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
 
@@ -349,7 +465,7 @@ TYPED_TEST(WarpPerspective, UnsupportedTwoChannels) {
                 nullptr));
 }
 
-TYPED_TEST(WarpPerspective, UnsupportedBorderTypeConst) {
+TYPED_TEST(WarpPerspectiveNearest, UnsupportedBorderTypeConst) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
 
@@ -360,7 +476,7 @@ TYPED_TEST(WarpPerspective, UnsupportedBorderTypeConst) {
                                          KLEIDICV_BORDER_TYPE_CONSTANT, src));
 }
 
-TYPED_TEST(WarpPerspective, UnsupportedTooSmallImage) {
+TYPED_TEST(WarpPerspectiveNearest, UnsupportedTooSmallImage) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
 
@@ -371,47 +487,390 @@ TYPED_TEST(WarpPerspective, UnsupportedTooSmallImage) {
                 nullptr));
 }
 
-TYPED_TEST(WarpPerspective, UnsupportedInterpolation) {
+TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigStride) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
 
-  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
             kleidicv_warp_perspective_u8(
-                src, 1, 1, 1, dst, 8, 8, 1, transform_identity, 1,
-                KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE,
+                src, 1UL << 32, 1, 1, dst, 8, 8, 1, transform_identity, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
                 nullptr));
 }
 
-TYPED_TEST(WarpPerspective, UnsupportedBigStride) {
+TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigSourceHeight) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
 
   EXPECT_EQ(KLEIDICV_ERROR_RANGE,
             kleidicv_warp_perspective_u8(
-                src, 1UL << 32, 1, 1, dst, 8, 8, 1, transform_identity, 1,
+                src, 1, 1, 1UL << 24, dst, 8, 8, 1, transform_identity, 1,
                 KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
                 nullptr));
 }
 
-TYPED_TEST(WarpPerspective, UnsupportedBigHeight) {
+TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigSourceWidth) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
 
   EXPECT_EQ(KLEIDICV_ERROR_RANGE,
             kleidicv_warp_perspective_u8(
-                src, 1, 1, 1UL << 24, dst, 8, 8, 1, transform_identity, 1,
+                src, 1, 1UL << 24, 1, dst, 8, 8, 1, transform_identity, 1,
                 KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
                 nullptr));
 }
 
-TYPED_TEST(WarpPerspective, UnsupportedBigWidth) {
+TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigDestinationHeight) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
 
   EXPECT_EQ(KLEIDICV_ERROR_RANGE,
             kleidicv_warp_perspective_u8(
-                src, 1, 1UL << 24, 1, dst, 8, 8, 1, transform_identity, 1,
+                src, 1, 1, 1, dst, 8, 8, 1UL << 24, transform_identity, 1,
                 KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
                 nullptr));
 }
-#endif  // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE
+
+TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigDestinationWidth) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_warp_perspective_u8(
+                src, 1, 1, 1, dst, 1UL << 24, 1UL << 24, 1, transform_identity,
+                1, KLEIDICV_INTERPOLATION_NEAREST,
+                KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+}
+
+/////////////////////////////////////////////////////////////
+//       WarpPerspective with Linear Interpolation
+/////////////////////////////////////////////////////////////
+
+template <class ScalarType>
+class WarpPerspectiveLinear : public testing::Test {
+ public:
+  static void test_stripe(size_t src_w, size_t src_h, size_t dst_w,
+                          size_t dst_h, size_t y_begin, size_t y_end,
+                          const float transform[9], size_t channels,
+                          size_t padding,
+                          void (*initializer)(test::Array2D<ScalarType> &) =
+                              sequential_initializer) {
+    size_t src_total_width = channels * src_w;
+    size_t dst_total_width = channels * dst_w;
+
+    test::Array2D<ScalarType> source{src_total_width, src_h, padding, channels};
+    test::Array2D<ScalarType> actual{dst_total_width, dst_h, padding, channels};
+    test::Array2D<ScalarType> expected{dst_total_width, dst_h, padding,
+                                       channels};
+
+    initializer(source);
+    for (size_t row = y_begin; row < y_end; ++row) {
+      for (size_t column = 0; column < actual.width(); ++column) {
+        *actual.at(row, column) = 42;
+      }
+    }
+
+    calculate_expected(source, y_begin, y_end, transform, expected);
+
+    ASSERT_EQ(
+        KLEIDICV_OK,
+        kleidicv_warp_perspective_stripe_u8(
+            source.data(), source.stride(), source.width(), source.height(),
+            actual.data(), actual.stride(), actual.width(), actual.height(),
+            y_begin, y_end, transform, channels, KLEIDICV_INTERPOLATION_LINEAR,
+            KLEIDICV_BORDER_TYPE_REPLICATE, {}));
+
+    ScalarType threshold = 1;
+    for (size_t row = y_begin; row < y_end; ++row) {
+      for (size_t column = 0; column < expected.width(); ++column) {
+        const ScalarType *lhs = expected.at(row, column);
+        const ScalarType *rhs = actual.at(row, column);
+        if (!lhs || !rhs) {
+          GTEST_FAIL() << "Nullptr at (row=" << row << ", column=" << column
+                       << "): " << reinterpret_cast<const void *>(lhs) << " , "
+                       << reinterpret_cast<const void *>(lhs) << std::endl;
+        }
+        ScalarType error;
+        if constexpr (std::is_integral<ScalarType>::value) {
+          error = static_cast<ScalarType>(
+              abs(static_cast<int64_t>(lhs[0]) - static_cast<int64_t>(rhs[0])));
+        } else {
+          error = std::abs(lhs[0] - rhs[0]);
+        }
+        if (error > threshold) {
+          GTEST_FAIL() << "Mismatch at (row=" << row << ", col=" << column
+                       << "): " << +(expected).at(row, column)[0] << " vs "
+                       << +(actual).at(row, column)[0] << "." << std::endl;
+        }
+      }
+    }
+  }
+
+  static void test(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h,
+                   const float transform[9], size_t channels, size_t padding,
+                   void (*initializer)(test::Array2D<ScalarType> &) =
+                       sequential_initializer) {
+    test_stripe(src_w, src_h, dst_w, dst_h, 0, dst_h, transform, channels,
+                padding, initializer);
+  }
+
+ private:
+  static void calculate_expected(test::Array2D<ScalarType> &src, size_t y_begin,
+                                 size_t y_end, const float transform[9],
+                                 test::Array2D<ScalarType> &expected) {
+    for (size_t y = y_begin; y < y_end; ++y) {
+      for (size_t x = 0; x < expected.width() / src.channels(); ++x) {
+        double dx = static_cast<double>(x), dy = static_cast<double>(y);
+        double tw = transform[6] * dx + transform[7] * dy + transform[8];
+        double inv_w = 1.F / tw;
+        double tx = transform[0] * dx + transform[1] * dy + transform[2];
+        double ty = transform[3] * dx + transform[4] * dy + transform[5];
+        double fx = inv_w * tx;
+        double fy = inv_w * ty;
+        uint32_t ix0 = static_cast<uint32_t>(
+            std::max(0.0, std::min(fx, src.width() - 1.0)));
+        uint32_t iy0 = static_cast<uint32_t>(
+            std::max(0.0, std::min(fy, src.height() - 1.0)));
+        uint32_t ix1 = (ix0 < src.width() - 1 && fx >= 0) ? ix0 + 1 : ix0;
+        uint32_t iy1 = (iy0 < src.height() - 1 && fy >= 0) ? iy0 + 1 : iy0;
+        double xfrac =
+            (ix0 < src.width() - 1 && fx >= 0) ? fx - floor(fx) : 0.0;
+        double yfrac =
+            (iy0 < src.height() - 1 && fy >= 0) ? fy - floor(fy) : 0.0;
+        for (size_t ch = 0; ch < src.channels(); ++ch) {
+          double a = *src.at(iy0, ix0 * src.channels() + ch);
+          double b = *src.at(iy0, ix1 * src.channels() + ch);
+          double c = *src.at(iy1, ix0 * src.channels() + ch);
+          double d = *src.at(iy1, ix1 * src.channels() + ch);
+          double line1 = (b - a) * xfrac + a;
+          double line2 = (d - c) * xfrac + c;
+          double double_result = (line2 - line1) * yfrac + line1;
+          *expected.at(y, x * src.channels() + ch) =
+              static_cast<ScalarType>(lround(double_result));
+        }
+      }
+    }
+  }
+};
+
+TYPED_TEST_SUITE(WarpPerspectiveLinear, WarpPerspectiveElementTypes);
+
+TYPED_TEST(WarpPerspectiveLinear, SimpleResize) {
+  const size_t src_w = 16;
+  const size_t src_h = 4;
+  const size_t dst_w = 17;
+  const size_t dst_h = 5;
+  const size_t padding = 1;
+
+  test::Array2D<TypeParam> source(src_w, src_h, padding, 1);
+  source.fill(0);
+  test::Array2D<TypeParam> actual(dst_w, dst_h, padding, 1);
+  actual.fill(0);
+  test::Array2D<TypeParam> expected(dst_w, dst_h, padding, 1);
+  expected.fill(0);
+
+  // clang-format off
+  source.set(0, 0, { 0, 100,   0, 100,   0, 100,   0, 100,   0, 100,   0, 100,   0, 100,   0, 100 });
+  source.set(1, 0, { 0, 100,   0, 100,   0, 100,   0, 100,   0, 100,   0, 100,   0, 100,   0, 100});
+  source.set(2, 0, { 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100});
+
+  // first line: y = -0.3, so only the first line counts. x = [0, 0.9, 1.8, 2.7, 3.6, ...]
+  expected.set(0, 0, {  0,  90,  20,  70,  40,  50,  60,  30,  80,  10, 100,  10,  80,  30,  60,  50,  40 });
+  // y = 1.2,  x = [-0.1, 0.8, 1.7, 2.6, 3.5, ...]
+  expected.set(1, 0, { 20,  84,  44,  68,  60,  52,  76,  36,  92,  20,  92,  36,  76,  52,  60,  68,  44 });
+  // y = 2.7,  x = [-0.2, 0.7, 1.6, 2.5, ...]
+  expected.set(2, 0, { 30,  30,  30,  30,  30,  30,  30,  30,  30,  30,  30,  30,  30,  30,  30,  30,  30 });
+  // y = 4.2 and 5.7: all zeroes, x does not count
+
+  const float little_scale[] = {
+    0.9, -0.1, 0,
+    0, 1.5, -0.3,
+    0, 0, 1
+  };
+  // clang-format on
+
+  EXPECT_EQ(KLEIDICV_OK,
+            kleidicv_warp_perspective_u8(
+                source.data(), source.stride(), source.width(), source.height(),
+                actual.data(), actual.stride(), actual.width(), actual.height(),
+                little_scale, 1, KLEIDICV_INTERPOLATION_LINEAR,
+                KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+
+  EXPECT_EQ_ARRAY2D(actual, expected);
+}
+
+TYPED_TEST(WarpPerspectiveLinear, IdentityNoPadding) {
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, 0);
+}
+
+TYPED_TEST(WarpPerspectiveLinear, TransposeNoPadding) {
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, 0);
+}
+
+TYPED_TEST(WarpPerspectiveLinear, SmallPadding) {
+  size_t src_w = 5 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, 13);
+}
+
+TYPED_TEST(WarpPerspectiveLinear, Upscale) {
+  // clang-format off
+  const float transform_upscale[] = {
+    0.2, 0.05, 0.1,
+    0.02, 0.2, -0.1,
+    0.0001, 0.0001, 1.1
+  };
+  // clang-format on
+
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w * 3;
+  size_t dst_h = src_h * 2;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3);
+}
+
+TYPED_TEST(WarpPerspectiveLinear, RandomTransform) {
+  float transform[9];
+  // Not entirely random, as very small and very big floats (in absolute value)
+  // cause too big errors and they are far from being valid use cases anyway
+  test::PseudoRandomNumberGeneratorIntRange<size_t> exponentGenerator(-10, 10);
+  test::PseudoRandomNumberGeneratorFloatRange<float> mantissaGenerator(-1.0,
+                                                                       1.0);
+  for (size_t cc = 0; cc < 100; ++cc) {
+    for (size_t i = 0; i < 9; ++i) {
+      transform[i] =
+          mantissaGenerator.next().value_or(1.0) *
+          static_cast<float>(
+              exp(static_cast<double>(exponentGenerator.next().value_or(1.0))));
+    }
+
+    size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+    size_t src_h = 4;
+    size_t dst_w = src_w;
+    size_t dst_h = src_h;
+    TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, 19);
+  }
+}
+
+TYPED_TEST(WarpPerspectiveLinear, DivisionByZero) {
+  // clang-format off
+  const float transform_div_by_zero[] = {
+    1, 0, 0,
+    0, 1, 0,
+    1, 1, -3
+  };
+  // clang-format on
+
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, 3);
+}
+
+template <class ScalarType>
+static void crisscross_initializer(test::Array2D<ScalarType> &source) {
+  for (int64_t y = 0; y < static_cast<int64_t>(source.height()); ++y) {
+    for (int64_t x = 0; x < static_cast<int64_t>(source.width()); ++x) {
+      *source.at(y, x) =
+          ((x % 10 == 0 && x > 0) || (y % 10 == 0 && y > 0)) ? 0 : 255;
+    }
+  }
+}
+
+TYPED_TEST(WarpPerspectiveLinear, RoughSource) {
+  // clang-format off
+  const float transformation[] = {
+  -0.14721069682646492627, 0.0039219946625178928046, -0.23197875743643098234,
+   -9.6823083601201584969, 0.27678993497443726834, -24.722070405199925602,
+   0.043781423781792741523, 0.0021065152608927420821, -0.5757716849053092778
+  };
+  // clang-format on
+
+  size_t src_w = 13;
+  size_t src_h = 634;
+  size_t dst_w = 17;
+  size_t dst_h = 852;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transformation, 1, 13,
+                    crisscross_initializer);
+}
+
+TYPED_TEST(WarpPerspectiveLinear, BigSourceImage) {
+  // clang-format off
+  const float transform_cut[] = {
+    1, 0, 1ULL<<17,
+    0, 1, 1ULL<<17,
+    0, 0, 1
+  };
+  // clang-format on
+
+  // Let stride * height be bigger than 1 << 32
+  size_t dst_w = kPartWidth;
+  size_t dst_h = kPartHeight;
+  size_t src_w = kBigWidth + kPartWidth + 3;
+  size_t src_h = kBigHeight + kPartHeight;
+
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, 0,
+                    part_initializer);
+}
+
+TYPED_TEST(WarpPerspectiveLinear, BigWidthDestination) {
+  // clang-format off
+  const float transform_upscale[] = {
+    0.0011, 0, 0.3,
+    0, 1.02, -0.1,
+    0, 0, 1
+  };
+  // clang-format on
+
+  size_t dst_w = kBigWidth;
+  size_t dst_h = 1;
+  size_t src_w = dst_w / 1000;
+  size_t src_h = dst_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3);
+}
+
+TYPED_TEST(WarpPerspectiveLinear, BigHeightDestination) {
+  // clang-format off
+  const float transform_upscale[] = {
+    1.03, 0, 0.1,
+    0, 0.00011, -0.3,
+    0, 0, 1
+  };
+  // clang-format on
+
+  size_t dst_w = 17;
+  size_t dst_h = kHugeHeight;
+  size_t src_w = dst_w;
+  size_t src_h = dst_h / 10000;
+  TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight,
+                           dst_h, transform_upscale, 1, 0);
+}
+
+TYPED_TEST(WarpPerspectiveLinear, HugeHeightSourceAndDestination) {
+  // clang-format off
+  const float transform[] = {
+    1.03, 0, 0.1,
+    0, 0.9993, -0.3,
+    0, 0, 1
+  };
+  // clang-format on
+
+  size_t dst_w = 17;
+  size_t dst_h = (1ULL << 24) - 1;
+  size_t src_w = dst_w;
+  size_t src_h = dst_h;
+  TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight,
+                           dst_h, transform, 1, 0,
+                           huge_height_part_initializer<TypeParam>);
+}
diff --git a/test/framework/array.h b/test/framework/array.h
index 60bff94ac..cabdd9f4a 100644
--- a/test/framework/array.h
+++ b/test/framework/array.h
@@ -197,20 +197,29 @@ class Array2D : public TwoDimensional<ElementType> {
     }
   }
 
-  // Compares two instances for equality considering only element bytes.
-  // Returns the location of the first mismatch, if any.
+  // Compares two instances for (near) equality considering its content
+  // (elements). Returns the location of the first mismatch, if any.
   std::optional<std::tuple<size_t, size_t>> compare_to(
-      const Array2D<ElementType> &other) const {
+      const Array2D<ElementType> &other, ElementType threshold = 0) const {
     for (size_t row = 0; row < height(); ++row) {
       for (size_t column = 0; column < width(); ++column) {
         const ElementType *lhs = at(row, column);
         const ElementType *rhs = other.at(row, column);
-        if (!lhs || !rhs || (lhs[0] != rhs[0])) {
+        if (!lhs || !rhs) {
+          return std::make_tuple(row, column);
+        }
+        ElementType error;
+        if constexpr (std::is_integral<ElementType>::value) {
+          error = static_cast<ElementType>(
+              abs(static_cast<int64_t>(lhs[0]) - static_cast<int64_t>(rhs[0])));
+        } else {
+          error = std::abs(lhs[0] - rhs[0]);
+        }
+        if (error > threshold) {
           return std::make_tuple(row, column);
         }
       }
     }
-
     return std::nullopt;
   }
 
@@ -235,8 +244,8 @@ class Array2D : public TwoDimensional<ElementType> {
   // Returns true if this object holds actual memory, otherwise false.
   bool valid() const { return data() != nullptr; }
 
-  // Returns a pointer to a data element at a given row and column position, or
-  // nullptr if the requested position is invalid.
+  // Returns a pointer to a data element at a given row and column position,
+  // or nullptr if the requested position is invalid.
   ElementType *at(size_t row, size_t column) override {
     return const_cast<ElementType *>(
         const_cast<const Array2D<ElementType> *>(this)->at(row, column));
@@ -255,8 +264,8 @@ class Array2D : public TwoDimensional<ElementType> {
   }
 
  private:
-  // Returns the number of elements between the end of one row and the start of
-  // the next row.
+  // Returns the number of elements between the end of one row and the start
+  // of the next row.
   size_t padding() { return stride() / sizeof(ElementType) - width(); }
 
   // Returns the offset to the first padding byte within a row.
@@ -336,9 +345,9 @@ class Array2D : public TwoDimensional<ElementType> {
       buffer_ =
           std::unique_ptr<ElementType[]>(new ElementType[allocation_count]);
       // Weaken alignment to flush out potential alignment issues.
-      // buffer_.get() will contain a pointer that is at least 16-byte aligned.
-      // By adding a small offset to that we get a pointer that is only aligned
-      // to sizeof(ElementType).
+      // buffer_.get() will contain a pointer that is at least 16-byte
+      // aligned. By adding a small offset to that we get a pointer that is
+      // only aligned to sizeof(ElementType).
       data_ = buffer_.get() + 1;
     } catch (...) {
       reset();
@@ -352,8 +361,8 @@ class Array2D : public TwoDimensional<ElementType> {
 
   // Smart pointer to the managed memory.
   std::unique_ptr<ElementType[]> buffer_;
-  // Pointer to the start of the data. This is offset from the start of buffer_
-  // to flush out potential alignment issues.
+  // Pointer to the start of the data. This is offset from the start of
+  // buffer_ to flush out potential alignment issues.
   ElementType *data_{nullptr};
   // Width a row in the array.
   size_t width_{0};
@@ -365,9 +374,9 @@ class Array2D : public TwoDimensional<ElementType> {
   size_t stride_{0};
 };  // end of class Array2D<ElementType>
 
-// Compares two Array2D objects for equality.
+// Compares two Array2D objects for (near-)equality.
 // Unary + is used to ensure values are printed as integers, not chars
-#define EXPECT_EQ_ARRAY2D(lhs, rhs)                                 \
+#define EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(threshold, lhs, rhs)       \
   do {                                                              \
     ASSERT_EQ((lhs).width(), (rhs).width())                         \
         << "Mismatch in width." << std::endl;                       \
@@ -375,7 +384,7 @@ class Array2D : public TwoDimensional<ElementType> {
         << "Mismatch in height." << std::endl;                      \
     ASSERT_EQ((lhs).channels(), (rhs).channels())                   \
         << "Mismatch in channels." << std::endl;                    \
-    auto mismatch = (lhs).compare_to((rhs));                        \
+    auto mismatch = (lhs).compare_to((rhs), (threshold));           \
     if (mismatch) {                                                 \
       auto [row, col] = *mismatch;                                  \
       GTEST_FAIL() << "Mismatch at (row=" << row << ", col=" << col \
@@ -385,6 +394,10 @@ class Array2D : public TwoDimensional<ElementType> {
     }                                                               \
   } while (0 != 0)
 
+// Compares two Array2D objects for equality.
+#define EXPECT_EQ_ARRAY2D(lhs, rhs) \
+  EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(0, (lhs), (rhs))
+
 // Compares two Array2D objects for inequality.
 #define EXPECT_NE_ARRAY2D(lhs, rhs)                                \
   do {                                                             \
-- 
GitLab