From 9f03e66eeeea3613a13d2c5f67a4fb382dc33939 Mon Sep 17 00:00:00 2001
From: Denes Tarjan <denes.tarjan@arm.com>
Date: Fri, 18 Oct 2024 11:10:14 +0000
Subject: [PATCH] Implement warpPerspective for NEON, nearest neighbour

---
 CHANGELOG.md                                  |   2 +
 adapters/opencv/kleidicv_hal.cpp              |  63 +++-
 adapters/opencv/kleidicv_hal.h                |  21 ++
 conformity/opencv/test_warp_perspective.cpp   |  75 +++++
 conformity/opencv/tests.cpp                   |   1 +
 conformity/opencv/tests.h                     |   1 +
 doc/functionality.md                          |   5 +
 doc/opencv.md                                 |  10 +
 kleidicv/include/kleidicv/ctypes.h            |   8 +
 kleidicv/include/kleidicv/kleidicv.h          |  40 +++
 kleidicv/include/kleidicv/neon.h              |  16 +-
 kleidicv/include/kleidicv/neon_intrinsics.h   |  15 +-
 .../kleidicv/transform/warp_perspective.h     |  60 ++++
 .../src/transform/warp_perspective_api.cpp    |  33 ++
 .../src/transform/warp_perspective_neon.cpp   | 168 ++++++++++
 .../include/kleidicv_thread/kleidicv_thread.h |  12 +
 kleidicv_thread/src/kleidicv_thread.cpp       |  22 ++
 scripts/benchmark/benchmarks.txt              |   2 +
 scripts/ci-opencv.sh                          |   2 +
 test/api/test_thread.cpp                      |  87 +++++
 test/api/test_warp_perspective.cpp            | 305 ++++++++++++++++++
 21 files changed, 941 insertions(+), 7 deletions(-)
 create mode 100644 conformity/opencv/test_warp_perspective.cpp
 create mode 100644 kleidicv/include/kleidicv/transform/warp_perspective.h
 create mode 100644 kleidicv/src/transform/warp_perspective_api.cpp
 create mode 100644 kleidicv/src/transform/warp_perspective_neon.cpp
 create mode 100644 test/api/test_warp_perspective.cpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a99014137..28ab92979 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,8 @@ This changelog aims to follow the guiding principles of
 - Remap implementation
   - 2-channel s16 and s16+u16 fixed-point coordinates and 1-channel u8 input.
   - 2-channel s16 + 5+5 bits' fractions fixed-point coordinates and 1-channel u8 input.
+- WarpPerspective implementation
+  - Nearest neighbour implementation for replicated borders and 1-channel u8 input.
 - Implementation for cv::pyrDown in the OpenCV HAL.
 - Sum implementation for 1-channel f32 input (not exposed to OpenCV)
 
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index 99b2b10af..3953a2b63 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -306,7 +306,7 @@ int threshold(const uchar *src_data, size_t src_step, uchar *dst_data,
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
 }
 
-// Converts an OpenCV border type to an KleidiCV border type.
+// Converts an OpenCV border type to a KleidiCV border type.
 static int from_opencv(int opencv_border_type,
                        kleidicv_border_type_t &border_type) {
   switch (opencv_border_type) {
@@ -338,6 +338,25 @@ static int from_opencv(int opencv_border_type,
   return 0;
 }
 
+// Converts an OpenCV interpolation type to a KleidiCV interpolation type.
+static int from_opencv(int opencv_interpolation,
+                       kleidicv_interpolation_type_t &interpolation_type) {
+  switch (opencv_interpolation) {
+    default:
+      return 1;
+    case CV_HAL_INTER_NEAREST:
+      interpolation_type =
+          kleidicv_interpolation_type_t::KLEIDICV_INTERPOLATION_NEAREST;
+      break;
+    case CV_HAL_INTER_LINEAR:
+      interpolation_type =
+          kleidicv_interpolation_type_t::KLEIDICV_INTERPOLATION_LINEAR;
+      break;
+  }
+
+  return 0;
+}
+
 struct SeparableFilter2DParams {
   size_t channels;
   kleidicv_border_type_t border_type;
@@ -1354,4 +1373,46 @@ int scharr_deriv(const uchar *src_data, size_t src_step, int16_t *dst_data,
       src, src_step, width + 2, height + 2, cn, dst_data, dst_step, mt));
 }
 
+int warp_perspective(int src_type, const uchar *src_data, size_t src_step,
+                     int src_width, int src_height, uchar *dst_data,
+                     size_t dst_step, int dst_width, int dst_height,
+                     const double transformation[9], int interpolation,
+                     int border_type, const double border_value[4]) {
+  kleidicv_border_type_t kleidicv_border_type;
+  if (from_opencv(border_type, kleidicv_border_type)) {
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+  }
+
+  float float_transformation[9];
+  for (size_t i = 0; i < 9; ++i) {
+    float_transformation[i] = static_cast<float>(transformation[i]);
+  }
+
+  kleidicv_interpolation_type_t kleidicv_interpolation_type;
+  if (from_opencv(interpolation, kleidicv_interpolation_type)) {
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+  }
+
+  kleidicv_border_values_t border_values = {};
+  if (kleidicv_border_type ==
+      kleidicv_border_type_t::KLEIDICV_BORDER_TYPE_CONSTANT) {
+    border_values.top = border_value[0];
+    border_values.left = border_value[1];
+    border_values.bottom = border_value[2];
+    border_values.right = border_value[3];
+  }
+  auto mt = get_multithreading();
+
+  if (src_type == CV_8UC1) {
+    return convert_error(kleidicv_thread_warp_perspective_u8(
+        src_data, src_step, static_cast<size_t>(src_width),
+        static_cast<size_t>(src_height), dst_data, dst_step,
+        static_cast<size_t>(dst_width), static_cast<size_t>(dst_height),
+        float_transformation, 1, kleidicv_interpolation_type,
+        kleidicv_border_type, border_values, mt));
+  }
+
+  return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+
 }  // namespace kleidicv::hal
diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index 2d34f27f9..c8bac7902 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -153,8 +153,15 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step,
                     const uint16_t *mapfrac, size_t mapfrac_step,
                     int border_type, const double border_value[4]);
 
+int warp_perspective(int src_type, const uchar *src_data, size_t src_step,
+                     int src_width, int src_height, uchar *dst_data,
+                     size_t dst_step, int dst_width, int dst_height,
+                     const double transformation[9], int interpolation,
+                     int borderType, const double borderValue[4]);
+
 int scharr_deriv(const uchar *src_data, size_t src_step, int16_t *dst_data,
                  size_t dst_step, int width, int height, int cn);
+
 }  // namespace hal
 }  // namespace kleidicv
 
@@ -418,6 +425,20 @@ static inline int kleidicv_pyrdown_with_fallback(
 #undef cv_hal_pyrdown
 #define cv_hal_pyrdown kleidicv_pyrdown_with_fallback
 
+static inline int kleidicv_warp_perspective_with_fallback(
+    int src_type, const uchar *src_data, size_t src_step, int src_width,
+    int src_height, uchar *dst_data, size_t dst_step, int dst_width,
+    int dst_height, const double transformation[9], int interpolation,
+    int borderType, const double borderValue[4]) {
+  return KLEIDICV_HAL_FALLBACK_FORWARD(
+      warp_perspective, cv_hal_warpPerspective, src_type, src_data, src_step,
+      src_width, src_height, dst_data, dst_step, dst_width, dst_height,
+      transformation, interpolation, borderType, borderValue);
+}
+
+#undef cv_hal_warpPerspective
+#define cv_hal_warpPerspective kleidicv_warp_perspective_with_fallback
+
 #endif  // OPENCV_IMGPROC_HAL_REPLACEMENT_HPP
 
 #ifdef OPENCV_CORE_HAL_REPLACEMENT_HPP
diff --git a/conformity/opencv/test_warp_perspective.cpp b/conformity/opencv/test_warp_perspective.cpp
new file mode 100644
index 000000000..775e3d435
--- /dev/null
+++ b/conformity/opencv/test_warp_perspective.cpp
@@ -0,0 +1,75 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "opencv2/core/hal/interface.h"
+#include "opencv2/imgproc/hal/interface.h"
+#include "utils.h"
+
+// clang-format off
+double transform[] = {
+  0.8, 0.1, 2,
+  0.1, 0.8, -2,
+  0.001, 0.001, 1.7
+};
+// clang-format on
+
+// BorderValue is interpreted as 1/1000, i.e. 500 for 0.5
+template <class ScalarType, int Format, int Interpolation, int BorderMode,
+          int BorderValue>
+cv::Mat exec_warp_perspective(cv::Mat& source_mat) {
+  cv::Mat result(source_mat.rows, source_mat.cols, Format);
+  cv::Mat M(3, 3, CV_64FC1, reinterpret_cast<uint8_t*>(transform));
+  cv::warpPerspective(source_mat, result, M, result.size(), Interpolation,
+                      BorderMode, BorderValue / 1000.0);
+  return result;
+}
+
+#if MANAGER
+const int kMaxHeight = 42, kMaxWidth = 42;
+
+template <class ScalarType, int Format, int Interpolation, int BorderMode,
+          int BorderValue>
+bool test_warp_perspective(int index, RecreatedMessageQueue& request_queue,
+                           RecreatedMessageQueue& reply_queue) {
+  for (size_t w = 8; w <= kMaxWidth; w += 3) {
+    for (size_t h = 8; h <= kMaxHeight; h += 4) {
+      cv::Mat source_mat(h, w, Format);
+      for (size_t row = 0; row < h; ++row) {
+        for (size_t column = 0; column < w; ++column) {
+          source_mat.at<ScalarType>(row, column) =
+              (row * (w + 12) + column) %
+              std::numeric_limits<ScalarType>::max();
+        }
+      }
+
+      cv::Mat actual_mat =
+          exec_warp_perspective<ScalarType, Format, Interpolation, BorderMode,
+                                BorderValue>(source_mat);
+      cv::Mat expected_mat = get_expected_from_subordinate(
+          index, request_queue, reply_queue, source_mat);
+
+      bool success =
+          !are_matrices_different<ScalarType>(1, actual_mat, expected_mat);
+      if (!success) {
+        fail_print_matrices(w, h, source_mat, actual_mat, expected_mat);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+#endif
+
+std::vector<test>& warp_perspective_tests_get() {
+  // clang-format off
+  static std::vector<test> tests = {
+    TEST("WarpPerspective uint8", (test_warp_perspective<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>), (exec_warp_perspective<uint8_t, CV_8UC1, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 0>)),
+  };
+  // clang-format on
+  return tests;
+}
diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp
index b2648707f..7bc7a48e6 100644
--- a/conformity/opencv/tests.cpp
+++ b/conformity/opencv/tests.cpp
@@ -40,6 +40,7 @@ std::vector<test> all_tests = merge_tests({
     min_max_tests_get,
     in_range_tests_get,
     remap_tests_get,
+    warp_perspective_tests_get,
     blur_and_downsample_tests_get,
     scharr_interleaved_tests_get,
     // clang-format on
diff --git a/conformity/opencv/tests.h b/conformity/opencv/tests.h
index e148873a2..fc1d85602 100644
--- a/conformity/opencv/tests.h
+++ b/conformity/opencv/tests.h
@@ -23,6 +23,7 @@ std::vector<test>& scale_tests_get();
 std::vector<test>& min_max_tests_get();
 std::vector<test>& in_range_tests_get();
 std::vector<test>& remap_tests_get();
+std::vector<test>& warp_perspective_tests_get();
 std::vector<test>& blur_and_downsample_tests_get();
 std::vector<test>& scharr_interleaved_tests_get();
 
diff --git a/doc/functionality.md b/doc/functionality.md
index 70dec5b55..806c4b786 100644
--- a/doc/functionality.md
+++ b/doc/functionality.md
@@ -96,3 +96,8 @@ See `doc/opencv.md` for details of the functionality available in OpenCV.
 |--------------------------------------------|-----|-----|
 | Remap int16 coordinates                    |  x  |     |
 | Remap int16+uint16 fixed-point coordinates |  x  |     |
+
+# WarpPerspective
+|                                       |  u8 | u16 |
+|---------------------------------------|-----|-----|
+| Nearest neighbour, replicated borders |  x  |     |
diff --git a/doc/opencv.md b/doc/opencv.md
index 0c5e6b789..1c80d62b2 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -203,6 +203,16 @@ Supported map configurations:
 * `map1` is 16SC2 and `map2` is 16UC1: `map1` is as above, `map2` contains combined 5+5 bits of x (low) and y (high) fractions, i.e. x = x1 + x2 / 2^5
   * supported `interpolation`: `INTER_LINEAR` only
 
+### [`cv::warpPerspective()`](https://docs.opencv.org/4.10.0/da/d54/group__imgproc__transform.html#gaf73673a7e8e18ec6963e3774e6a94b87)
+Perspectively transforms the `src` image, using a 3x3 linear transformation matrix on the coordinates.
+
+Notes on parameters:
+* `src.depth()` - only supports `CV_8U` depth and 1 channel.
+* `borderMode` - only supports `BORDER_REPLICATE`
+* `interpolation` - only supports `INTER_NEAREST`
+* `dst_width` - must be at least 8
+* `src_stride` - must fit into uint32
+
 ### [`cv::pyrDown()`](https://docs.opencv.org/4.10.0/d4/d86/group__imgproc__filter.html#gaf9bba239dfca11654cb7f50f889fc2ff)
 Blurs and downsamples an image.
 
diff --git a/kleidicv/include/kleidicv/ctypes.h b/kleidicv/include/kleidicv/ctypes.h
index 0349db0a4..59a177d17 100644
--- a/kleidicv/include/kleidicv/ctypes.h
+++ b/kleidicv/include/kleidicv/ctypes.h
@@ -87,6 +87,14 @@ typedef enum {
   KLEIDICV_BORDER_TYPE_NONE,
 } kleidicv_border_type_t;
 
+/// KleidiCV interpolation types
+typedef enum {
+  /** Nearest neighbour interpolation */
+  KLEIDICV_INTERPOLATION_NEAREST,
+  /** Bilinear interpolation */
+  KLEIDICV_INTERPOLATION_LINEAR,
+} kleidicv_interpolation_type_t;
+
 /// Internal structure where morphology operations store their state
 typedef struct kleidicv_morphology_context_t_ kleidicv_morphology_context_t;
 
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index e76dfc8b7..182729e8a 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1835,6 +1835,46 @@ kleidicv_error_t kleidicv_scharr_interleaved_s16_u8(
     size_t src_channels, int16_t *dst, size_t dst_stride);
 #endif  // DOXYGEN
 
+/// Transforms the `src` image perspectively, using a linear transformation
+/// matrix, i.e. for each pixel in `dst` take a pixel from `src` specified by
+/// the transformed x and y coordinates, and optionally doing a bilinear
+/// interpolation.
+///
+/// `src` and `dst` dimensions may be different. Number of elements is limited
+/// to @ref KLEIDICV_MAX_IMAGE_PIXELS.
+///
+/// @param src            Pointer to the source data. Must be non-null.
+/// @param src_stride     Distance in bytes from the start of one row to the
+///                       start of the next row for the source data. Must
+///                       not be less than `width * sizeof(type)`, except for
+///                       single-row images. Must be less than 2^32.
+/// @param src_width      Number of elements in the source row.
+/// @param src_height     Number of rows in the source data. Must be less than
+///                       2^32.
+/// @param dst            Pointer to the destination data. Must be non-null.
+/// @param dst_stride     Distance in bytes from the start of one row to the
+///                       start of the next row for the destination data.
+///                       Must be a multiple of `sizeof(type)` and no less than
+///                       `width * sizeof(type)`, except for single-row images.
+/// @param dst_width      Number of elements in the destination row. Must be at
+///                       least 8.
+/// @param dst_height     Number of rows in the destination data.
+/// @param transformation Pointer to the transformation matrix of 9 values.
+/// @param channels       Number of channels in the data. Must be 1.
+/// @param interpolation  Interpolation algorithm. Supported types: \n
+///                         - @ref KLEIDICV_INTERPOLATION_NEAREST
+/// @param border_type    Way of handling the border. The supported border types
+///                       are: \n
+///                         - @ref KLEIDICV_BORDER_TYPE_REPLICATE
+/// @param border_values  Border values if the border_type is
+///                       @ref KLEIDICV_BORDER_TYPE_CONSTANT.
+kleidicv_error_t kleidicv_warp_perspective_u8(
+    const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
+    uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
+    const float transformation[9], size_t channels,
+    kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t border_type, kleidicv_border_values_t border_values);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/kleidicv/include/kleidicv/neon.h b/kleidicv/include/kleidicv/neon.h
index 35e42a88c..9fc3974ec 100644
--- a/kleidicv/include/kleidicv/neon.h
+++ b/kleidicv/include/kleidicv/neon.h
@@ -134,14 +134,24 @@ class VectorTypes<uint64_t> {
 };  // end of class VectorTypes<uint64_t>
 
 template <>
-class VectorTypes<float32_t> {
+class VectorTypes<float> {
  public:
-  using ScalarType = float32_t;
+  using ScalarType = float;
   using VectorType = float32x4_t;
   using Vector2Type = float32x4x2_t;
   using Vector3Type = float32x4x3_t;
   using Vector4Type = float32x4x4_t;
-};  // end of class VectorTypes<float32_t>
+};  // end of class VectorTypes<float>
+
+template <>
+class VectorTypes<double> {
+ public:
+  using ScalarType = double;
+  using VectorType = float64x2_t;
+  using Vector2Type = float64x2x2_t;
+  using Vector3Type = float64x2x3_t;
+  using Vector4Type = float64x2x4_t;
+};  // end of class VectorTypes<double>
 
 // Base class for all NEON vector traits.
 template <typename ScalarType>
diff --git a/kleidicv/include/kleidicv/neon_intrinsics.h b/kleidicv/include/kleidicv/neon_intrinsics.h
index df042a7df..b3edbf4ef 100644
--- a/kleidicv/include/kleidicv/neon_intrinsics.h
+++ b/kleidicv/include/kleidicv/neon_intrinsics.h
@@ -67,9 +67,18 @@ NEON_BINARY_OP_Q_B8_B16_B32_B64(vzip2q);
 
 #undef NEON_BINARY_OP_Q_B8_B16_B32_B64
 
-static inline float32x4_t vaddq(float32x4_t lhs, float32x4_t rhs) {
-  return vaddq_f32(lhs, rhs);
-}
+#define NEON_BINARY_OP_Q_F32_F64(name)                               \
+  static inline float32x4_t name(float32x4_t lhs, float32x4_t rhs) { \
+    return name##_f32(lhs, rhs);                                     \
+  }                                                                  \
+                                                                     \
+  static inline float64x2_t name(float64x2_t lhs, float64x2_t rhs) { \
+    return name##_f64(lhs, rhs);                                     \
+  }
+
+NEON_BINARY_OP_Q_F32_F64(vaddq);
+
+#undef NEON_BINARY_OP_Q_F32_F64
 
 // clang-format off
 
diff --git a/kleidicv/include/kleidicv/transform/warp_perspective.h b/kleidicv/include/kleidicv/transform/warp_perspective.h
new file mode 100644
index 000000000..6c3bd0e4c
--- /dev/null
+++ b/kleidicv/include/kleidicv/transform/warp_perspective.h
@@ -0,0 +1,60 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_TRANSFORM_WARP_PERSPECTIVE_H
+#define KLEIDICV_TRANSFORM_WARP_PERSPECTIVE_H
+
+#include <limits>
+#include <type_traits>
+
+#include "kleidicv/ctypes.h"
+
+extern "C" {
+// For internal use only. See kleidicv_warp_perspective_u8 instead.
+// Calculates a stripe of the `dst` image that is a transformed part of `src`.
+// The stripe is defined by the range (y_begin, y_end].
+KLEIDICV_API_DECLARATION(kleidicv_warp_perspective_stripe_u8,
+                         const uint8_t *src, size_t src_stride,
+                         size_t src_width, size_t src_height, uint8_t *dst,
+                         size_t dst_stride, size_t dst_width, size_t dst_height,
+                         size_t y_begin, size_t y_end,
+                         const float transformation[9], size_t channels,
+                         kleidicv_interpolation_type_t interpolation,
+                         kleidicv_border_type_t border_type,
+                         kleidicv_border_values_t border_values);
+}
+
+namespace kleidicv {
+
+template <typename T>
+inline bool warp_perspective_is_implemented(
+    size_t dst_width, kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t border_type,
+    size_t channels) KLEIDICV_STREAMING_COMPATIBLE {
+  if constexpr (std::is_same<T, uint8_t>::value) {
+    return (dst_width >= 8 &&
+            interpolation ==
+                kleidicv_interpolation_type_t::KLEIDICV_INTERPOLATION_NEAREST &&
+            border_type ==
+                kleidicv_border_type_t::KLEIDICV_BORDER_TYPE_REPLICATE &&
+            channels == 1);
+  } else {
+    return false;
+  }
+}
+
+namespace neon {
+
+template <typename T>
+kleidicv_error_t warp_perspective_stripe(
+    const T *src, size_t src_stride, size_t src_width, size_t src_height,
+    T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
+    size_t y_begin, size_t y_end, const float transformation[9],
+    size_t channels, kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t border_type, kleidicv_border_values_t border_values);
+}  // namespace neon
+
+}  // namespace kleidicv
+
+#endif  // KLEIDICV_TRANSFORM_WARP_PERSPECTIVE_H
diff --git a/kleidicv/src/transform/warp_perspective_api.cpp b/kleidicv/src/transform/warp_perspective_api.cpp
new file mode 100644
index 000000000..d37c141fd
--- /dev/null
+++ b/kleidicv/src/transform/warp_perspective_api.cpp
@@ -0,0 +1,33 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv/dispatch.h"
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/transform/warp_perspective.h"
+
+KLEIDICV_MULTIVERSION_C_API(kleidicv_warp_perspective_stripe_u8,
+                            &kleidicv::neon::warp_perspective_stripe<uint8_t>,
+                            nullptr, nullptr);
+
+extern "C" {
+
+kleidicv_error_t kleidicv_warp_perspective_u8(
+    const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
+    uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
+    const float transformation[9], size_t channels,
+    kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t border_type,
+    kleidicv_border_values_t border_values) {
+  if (!kleidicv::warp_perspective_is_implemented<uint8_t>(
+          dst_width, interpolation, border_type, channels)) {
+    return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+  }
+
+  return kleidicv_warp_perspective_stripe_u8(
+      src, src_stride, src_width, src_height, dst, dst_stride, dst_width,
+      dst_height, 0, dst_height, transformation, 1, interpolation, border_type,
+      border_values);
+}
+
+}  // extern "C"
diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp
new file mode 100644
index 000000000..702664345
--- /dev/null
+++ b/kleidicv/src/transform/warp_perspective_neon.cpp
@@ -0,0 +1,168 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv/ctypes.h"
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/neon.h"
+#include "kleidicv/traits.h"
+#include "kleidicv/transform/warp_perspective.h"
+
+static inline uint32_t GetFPCR() {
+  uint64_t r = 0;
+  asm("mrs %0, fpcr" : "=r"(r));
+  return r;
+}
+static inline void SetFPCR(uint32_t r) {
+  asm volatile("msr fpcr, %0" : : "r"(static_cast<uint64_t>(r)));
+}
+
+namespace kleidicv::neon {
+
+// Template for WarpPerspective transformation.
+// Destination pixels are filled from the source, by taking pixels using the
+// transformed coordinates that are calculated as follows:
+//
+//                    [ T0, T1, T2 ]   [ x ]
+//      (x',y',w') =  [ T3, T4, T5 ] * [ y ]
+//                    [ T6, T7, T8 ]   [ 1 ]
+//  then
+//
+//      xt = x' / w'
+//      yt = y' / w'
+//
+//  or putting it together:
+//
+//      xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8)
+//      yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8)
+//
+template <typename ScalarType>
+class WarpPerspective;
+
+template <>
+class WarpPerspective<uint8_t> {
+ public:
+  using ScalarType = uint8_t;
+  using CoordVecTraits = VecTraits<float>;
+  using CoordVector = CoordVecTraits::VectorType;
+
+  WarpPerspective(Rows<const ScalarType> src_rows, size_t src_width,
+                  size_t src_height)
+      : src_rows_{src_rows},
+        v_src_stride_{vdup_n_u32(static_cast<uint32_t>(src_rows_.stride()))},
+        x0123_{vld1q(first_few_x)},
+        v_xmax_{vdupq_n_f32(static_cast<float>(src_width - 1))},
+        v_ymax_{vdupq_n_f32(static_cast<float>(src_height - 1))} {}
+
+  void process_row(size_t y, size_t width, const float transform[9],
+                   Columns<ScalarType> dst) {
+    float dy = static_cast<float>(y);
+    // Calculate half-transformed values at the first pixel (nominators)
+    // tw =  T6*x + T7*y + T8
+    // tx = (T0*x + T1*y + T2) / tw
+    // ty = (T3*x + T4*y + T5) / tw
+    float x0 = transform[1] * dy + transform[2];
+    float y0 = transform[4] * dy + transform[5];
+    float w0 = transform[7] * dy + transform[8];
+    // The next few values can be calculated by adding the corresponding Tn*x
+    CoordVector tx0 =
+        vmlaq_f32(vdupq_n_f32(x0), x0123_, vdupq_n_f32(transform[0]));
+    CoordVector ty0 =
+        vmlaq_f32(vdupq_n_f32(y0), x0123_, vdupq_n_f32(transform[3]));
+    CoordVector tw0 =
+        vmlaq_f32(vdupq_n_f32(w0), x0123_, vdupq_n_f32(transform[6]));
+
+    auto vector_path = [&](size_t x) {
+      float fx = static_cast<float>(x);
+      // Calculate half-transformed values from the first few pixel values, plus
+      // Tn*x, similarly to the one above
+      CoordVector tx = vaddq_f32(tx0, vdupq_n_f32(transform[0] * fx));
+      CoordVector ty = vaddq_f32(ty0, vdupq_n_f32(transform[3] * fx));
+      CoordVector tw = vaddq_f32(tw0, vdupq_n_f32(transform[6] * fx));
+
+      // Calculate inverse weight because division is expensive
+      CoordVector iw = vdivq_f32(vdupq_n_f32(1.F), tw);
+      // Calc and clamp coordinates to within the dimensions of the source image
+      CoordVector xf =
+          vmaxq_f32(vdupq_n_f32(0.F), vminq_f32(vmulq_f32(tx, iw), v_xmax_));
+      CoordVector yf =
+          vmaxq_f32(vdupq_n_f32(0.F), vminq_f32(vmulq_f32(ty, iw), v_ymax_));
+
+      // Calculate offsets from coordinates (y * stride + x)
+      uint32x4_t xi = vcvtnq_u32_f32(xf);
+      uint32x4_t yi = vcvtnq_u32_f32(yf);
+      // To avoid losing precision, the final indices should be in 64 bits
+      uint64x2_t indices_low = vmlal_u32(vmovl_u32(vget_low_u32(xi)),
+                                         vget_low_u32(yi), v_src_stride_);
+      uint64x2_t indices_high =
+          vmlal_u32(vmovl_high_u32(xi), vget_high_u32(yi), v_src_stride_);
+      // Copy pixels from source
+      ptrdiff_t ix = static_cast<ptrdiff_t>(x);
+      dst[ix] = src_rows_[vgetq_lane_u64(indices_low, 0)];
+      dst[ix + 1] = src_rows_[vgetq_lane_u64(indices_low, 1)];
+      dst[ix + 2] = src_rows_[vgetq_lane_u64(indices_high, 0)];
+      dst[ix + 3] = src_rows_[vgetq_lane_u64(indices_high, 1)];
+    };
+
+    LoopUnroll2<TryToAvoidTailLoop> loop{width, CoordVecTraits::num_lanes()};
+    loop.unroll_once(vector_path);
+  }
+
+ private:
+  static constexpr float first_few_x[] = {0.F, 1.F, 2.F, 3.F};
+  Rows<const ScalarType> src_rows_;
+  uint32x2_t v_src_stride_;
+  CoordVector x0123_, v_xmax_, v_ymax_;
+};  // end of class WarpPerspective<uint8_t>
+
+template <typename T>
+kleidicv_error_t warp_perspective_stripe(
+    const T *src, size_t src_stride, size_t src_width, size_t src_height,
+    T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
+    size_t y_begin, size_t y_end, const float transformation[9],
+    size_t channels, kleidicv_interpolation_type_t, kleidicv_border_type_t,
+    kleidicv_border_values_t) {
+  CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
+  CHECK_POINTERS(transformation);
+  CHECK_IMAGE_SIZE(src_width, src_height);
+  CHECK_IMAGE_SIZE(dst_width, dst_height);
+
+  // Calculating in float32_t will only be precise until 24 bits
+  if (src_stride >= (1 << 24) || src_height >= (1 << 24)) {
+    return KLEIDICV_ERROR_RANGE;
+  }
+
+  Rows<const T> src_rows{src, src_stride, channels};
+  Rows<T> dst_rows{dst, dst_stride, channels};
+  WarpPerspective<T> operation{src_rows, src_width, src_height};
+  Rectangle rect{dst_width, dst_height};
+
+  auto original_fpcr = GetFPCR();
+  SetFPCR(original_fpcr & ~(1U << 17U));  // disable DZE, div by zero exception
+
+  dst_rows += y_begin;
+  for (size_t y = y_begin; y < y_end; ++y) {
+    operation.process_row(y, rect.width(), transformation,
+                          dst_rows.as_columns());
+    ++dst_rows;
+  }
+
+  SetFPCR(original_fpcr);
+
+  return KLEIDICV_OK;
+}
+
+#define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(type)                            \
+  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t                           \
+  warp_perspective_stripe<type>(                                               \
+      const type *src, size_t src_stride, size_t src_width, size_t src_height, \
+      type *dst, size_t dst_stride, size_t dst_width, size_t dst_height,       \
+      size_t y_begin, size_t y_end, const float transformation[9],             \
+      size_t channels, kleidicv_interpolation_type_t interpolation,            \
+      kleidicv_border_type_t border_type,                                      \
+      kleidicv_border_values_t border_values)
+
+KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(uint8_t);
+
+}  // namespace kleidicv::neon
diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
index 55d3cff66..3753af477 100644
--- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
+++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
@@ -398,6 +398,18 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u8(
     kleidicv_border_type_t border_type, kleidicv_border_values_t border_values,
     kleidicv_thread_multithreading);
 
+/// Internal - not part of the public API and its direct use is not supported.
+///
+/// Multithreaded implementation of kleidicv_warp_perspective_u8 - see the
+/// documentation of that function for more details.
+kleidicv_error_t kleidicv_thread_warp_perspective_u8(
+    const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
+    uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
+    const float transformation[9], size_t channels,
+    kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t border_type, kleidicv_border_values_t border_values,
+    kleidicv_thread_multithreading);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp
index ca6071134..6097dab4c 100644
--- a/kleidicv_thread/src/kleidicv_thread.cpp
+++ b/kleidicv_thread/src/kleidicv_thread.cpp
@@ -17,6 +17,7 @@
 #include "kleidicv/kleidicv.h"
 #include "kleidicv/remap/remap.h"
 #include "kleidicv/resize/resize_linear.h"
+#include "kleidicv/transform/warp_perspective.h"
 
 typedef std::function<kleidicv_error_t(unsigned, unsigned)> FunctionCallback;
 
@@ -676,3 +677,24 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u8(
   };
   return parallel_batches(callback, mt, dst_height);
 }
+
+kleidicv_error_t kleidicv_thread_warp_perspective_u8(
+    const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
+    uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
+    const float transformation[9], size_t channels,
+    kleidicv_interpolation_type_t interpolation,
+    kleidicv_border_type_t border_type, kleidicv_border_values_t border_values,
+    kleidicv_thread_multithreading mt) {
+  if (!kleidicv::warp_perspective_is_implemented<uint8_t>(
+          dst_width, interpolation, border_type, channels)) {
+    return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+  }
+
+  auto callback = [=](unsigned y_begin, unsigned y_end) {
+    return kleidicv_warp_perspective_stripe_u8(
+        src, src_stride, src_width, src_height, dst, dst_stride, dst_width,
+        dst_height, y_begin, std::min<size_t>(dst_height, y_end + 1),
+        transformation, channels, interpolation, border_type, border_values);
+  };
+  return parallel_batches(callback, mt, dst_height);
+}
diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt
index dfc095385..f173b8ba8 100755
--- a/scripts/benchmark/benchmarks.txt
+++ b/scripts/benchmark/benchmarks.txt
@@ -78,6 +78,8 @@ InRange_F32: opencv_perf_core '*inRangeScalar/*' '($PIXEL_FORMAT, 32FC1, 1, 2)'
 
 Remap_S16_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)'
 Remap_S16Point5_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)'
+WarpPerspective: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)'
+WarpPerspectiveNear: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)'
 
 BlurAndDownsample: opencv_perf_imgproc '*pyrDown/*' '($PIXEL_FORMAT, 8UC1)'
 
diff --git a/scripts/ci-opencv.sh b/scripts/ci-opencv.sh
index d0a7dc004..8b0df31b8 100755
--- a/scripts/ci-opencv.sh
+++ b/scripts/ci-opencv.sh
@@ -102,6 +102,8 @@ IMGPROC_TEST_PATTERNS=(
   '*Imgproc_Dilate*'
   '*Imgproc_Erode*'
   '*Imgproc_PyramidDown*'
+  '*Imgproc_Remap*'
+  '*Imgproc_Warp*'
 )
 IMGPROC_TEST_PATTERNS_STR="$(join_strings_with_colon "${IMGPROC_TEST_PATTERNS[*]}")"
 ../../../conformity/opencv_kleidicv/bin/opencv_test_imgproc \
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
index 48af96271..e4cb0b6e3 100644
--- a/test/api/test_thread.cpp
+++ b/test/api/test_thread.cpp
@@ -242,6 +242,74 @@ class Thread : public testing::TestWithParam<P> {
 
     EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, result);
   }
+
+  template <typename T, typename SingleThreadedFunc, typename MultithreadedFunc,
+            typename... Args>
+  void check_warp_perspective(SingleThreadedFunc single_threaded_func,
+                              MultithreadedFunc multithreaded_func,
+                              size_t channels, Args... args) {
+    unsigned test_width = 0, height = 0, thread_count = 0;
+    std::tie(test_width, height, thread_count) = GetParam();
+    const unsigned src_width = 300, src_height = 300;
+    // width < 8 are not supported, that's not tested here
+    size_t width = test_width + 8;
+    test::Array2D<T> src(size_t{src_width} * channels, src_height);
+    test::Array2D<T> dst_single(width * channels, height),
+        dst_multi(width * channels, height);
+    // clang-format off
+    const float transform[] = {
+      0.8, 0.1, 2,
+      0.1, 0.8, -2,
+      0.001, 0.001, 1.0
+    };
+    // clang-format on
+    kleidicv_error_t single_result = single_threaded_func(
+        src.data(), src.stride(), src_width, src_height, dst_single.data(),
+        dst_single.stride(), width, height, transform, channels, args...);
+
+    kleidicv_error_t multi_result = multithreaded_func(
+        src.data(), src.stride(), src_width, src_height, dst_multi.data(),
+        dst_multi.stride(), width, height, transform, channels, args...,
+        get_multithreading_fake(thread_count));
+
+    EXPECT_EQ(KLEIDICV_OK, single_result);
+    EXPECT_EQ(KLEIDICV_OK, multi_result);
+    EXPECT_EQ_ARRAY2D(dst_multi, dst_single);
+  }
+
+  template <typename T, typename MultithreadedFunc, typename... Args>
+  void check_warp_perspective_not_implemented(
+      MultithreadedFunc multithreaded_func, size_t channels, Args... args) {
+    unsigned test_width = 0, height = 0, thread_count = 0;
+    std::tie(test_width, height, thread_count) = GetParam();
+    const unsigned src_width = 300, src_height = 300;
+    // width < 8 are not supported!
+    size_t width = test_width + 8;
+    test::Array2D<T> src(size_t{src_width} * channels, src_height);
+    test::Array2D<T> dst_small(test_width * channels, height),
+        dst(width * channels, height);
+    // clang-format off
+    const float transform[] = {
+      0.8, 0.1, 2,
+      0.1, 0.8, -2,
+      0.001, 0.001, 1.0
+    };
+    // clang-format on
+
+    kleidicv_error_t result = multithreaded_func(
+        src.data(), src.stride(), src_width, src_height, dst.data(),
+        dst.stride(), width, height, transform, channels, args...,
+        get_multithreading_fake(thread_count));
+
+    EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, result);
+
+    result = multithreaded_func(src.data(), src.stride(), src_width, src_height,
+                                dst_small.data(), dst_small.stride(),
+                                test_width, height, transform, channels,
+                                args..., get_multithreading_fake(thread_count));
+
+    EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, result);
+  }
 };
 
 #define TEST_UNARY_OP(suffix, SrcT, DstT, ...)                              \
@@ -557,6 +625,25 @@ TEST_P(Thread, remap_s16point5_u8_not_implemented) {
       kleidicv_border_values_t{});
 }
 
+TEST_P(Thread, warp_perspective_u8_border_replicate) {
+  check_warp_perspective<uint8_t>(
+      kleidicv_warp_perspective_u8, kleidicv_thread_warp_perspective_u8, 1,
+      KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+      kleidicv_border_values_t{});
+}
+
+TEST_P(Thread, warp_perspective_u8_not_implemented) {
+  check_warp_perspective_not_implemented<uint8_t>(
+      kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_LINEAR,
+      KLEIDICV_BORDER_TYPE_REPLICATE, kleidicv_border_values_t{});
+  check_warp_perspective_not_implemented<uint8_t>(
+      kleidicv_thread_warp_perspective_u8, 2, KLEIDICV_INTERPOLATION_NEAREST,
+      KLEIDICV_BORDER_TYPE_REPLICATE, kleidicv_border_values_t{});
+  check_warp_perspective_not_implemented<uint8_t>(
+      kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_NEAREST,
+      KLEIDICV_BORDER_TYPE_CONSTANT, kleidicv_border_values_t{});
+}
+
 TEST_P(Thread, SobelHorizontal1Channel) {
   check_unary_op<uint8_t, int16_t>(kleidicv_sobel_3x3_horizontal_s16_u8,
                                    kleidicv_thread_sobel_3x3_horizontal_s16_u8,
diff --git a/test/api/test_warp_perspective.cpp b/test/api/test_warp_perspective.cpp
new file mode 100644
index 000000000..b4b7d62c8
--- /dev/null
+++ b/test/api/test_warp_perspective.cpp
@@ -0,0 +1,305 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+
+#include "framework/array.h"
+#include "framework/generator.h"
+#include "framework/utils.h"
+#include "kleidicv/ctypes.h"
+#include "kleidicv/kleidicv.h"
+
+// clang-format off
+static const float transform_identity[] = {
+  1.0, 0, 0,
+  0, 1.0, 0,
+  0, 0, 1.0
+};
+
+static const float transform_transpose[] = {
+  0, 1.0, 0,
+  1.0, 0, 0,
+  0, 0, 1.0
+};
+
+static const float transform_small[] = {
+  0.8, 0.1, 2,
+  0.1, 0.8, -2,
+  0.001, 0.001, 1.7
+};
+// clang-format on
+
+template <class ScalarType>
+static void random_initializer(test::Array2D<ScalarType> &source) {
+  test::PseudoRandomNumberGenerator<ScalarType> generator;
+  source.fill(generator);
+}
+
+template <class ScalarType>
+class WarpPerspective : public testing::Test {
+ public:
+  static void test(
+      size_t src_w, size_t src_h, size_t dst_w, size_t dst_h,
+      const float transform[9], size_t channels, size_t padding,
+      void (*initializer)(test::Array2D<ScalarType> &) = random_initializer) {
+    size_t src_total_width = channels * src_w;
+    size_t dst_total_width = channels * dst_w;
+
+    test::Array2D<ScalarType> source{src_total_width, src_h, padding, channels};
+    test::Array2D<ScalarType> actual{dst_total_width, dst_h, padding, channels};
+    test::Array2D<ScalarType> expected{dst_total_width, dst_h, padding,
+                                       channels};
+
+    initializer(source);
+    actual.fill(42);
+
+    calculate_expected(source, transform, expected);
+
+    ASSERT_EQ(KLEIDICV_OK, kleidicv_warp_perspective_u8(
+                               source.data(), source.stride(), source.width(),
+                               source.height(), actual.data(), actual.stride(),
+                               actual.width(), actual.height(), transform,
+                               channels, KLEIDICV_INTERPOLATION_NEAREST,
+                               KLEIDICV_BORDER_TYPE_REPLICATE, {}));
+
+    EXPECT_EQ_ARRAY2D(actual, expected);
+  }
+
+ private:
+  static void calculate_expected(test::Array2D<ScalarType> &src,
+                                 const float transform[9],
+                                 test::Array2D<ScalarType> &expected) {
+    for (size_t y = 0; y < expected.height(); ++y) {
+      for (size_t x = 0; x < expected.width() / src.channels(); ++x) {
+        double dx = static_cast<double>(x), dy = static_cast<double>(y);
+
+        double dw = transform[6] * dx + transform[7] * dy + transform[8];
+        double inv_w = dw == 0.0 ? 0.0 : 1.0 / dw;
+        double fx =
+            inv_w * (transform[0] * dx + transform[1] * dy + transform[2]);
+        double fy =
+            inv_w * (transform[3] * dx + transform[4] * dy + transform[5]);
+        ptrdiff_t ix = static_cast<ptrdiff_t>(
+            std::max<double>(0, std::min(fx + 0.5, src.width() - 1.0)));
+        ptrdiff_t iy = static_cast<ptrdiff_t>(
+            std::max<double>(0, std::min(fy + 0.5, src.height() - 1.0)));
+        for (size_t ch = 0; ch < src.channels(); ++ch) {
+          *expected.at(y, x * src.channels() + ch) =
+              *src.at(iy, ix * src.channels() + ch);
+        }
+      }
+    }
+  }
+};
+
+using WarpPerspectiveElementTypes = ::testing::Types<uint8_t>;
+TYPED_TEST_SUITE(WarpPerspective, WarpPerspectiveElementTypes);
+
+TYPED_TEST(WarpPerspective, IdentityNoPadding) {
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, 0);
+}
+
+TYPED_TEST(WarpPerspective, TransposeNoPadding) {
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, 0);
+}
+
+TYPED_TEST(WarpPerspective, SmallPadding) {
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, 13);
+}
+
+TYPED_TEST(WarpPerspective, Upscale) {
+  // clang-format off
+  const float transform_upscale[] = {
+    0.2, 0.05, 0.1,
+    0.02, 0.2, -0.1,
+    0.0001, 0.0001, 1.1
+  };
+  // clang-format on
+
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w * 3;
+  size_t dst_h = src_h * 2;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3);
+}
+
+TYPED_TEST(WarpPerspective, RandomTransform) {
+  float transform[9];
+  test::PseudoRandomNumberGenerator<float> generator;
+  for (size_t i = 0; i < 9; ++i) {
+    transform[i] = generator.next().value_or(1.0);
+  }
+
+  size_t src_w = 3 * test::Options::vector_lanes<TypeParam>() - 1;
+  size_t src_h = 4;
+  size_t dst_w = src_w;
+  size_t dst_h = src_h;
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, 19);
+}
+
+static const size_t big_width = 1 << 17, big_height = 1 << 17;
+static const size_t part_width = 16, part_height = 16;
+
+template <class ScalarType>
+static void part_initializer(test::Array2D<ScalarType> &source) {
+  ScalarType counter = 0;
+  for (size_t y = big_height; y < big_height + part_height; ++y) {
+    for (size_t x = big_width; x < big_width + part_width; ++x) {
+      *source.at(y, x) = ++counter;
+    }
+  }
+}
+
+TYPED_TEST(WarpPerspective, BigSourceImage) {
+  // clang-format off
+  const float transform_cut[] = {
+    1, 0, 1<<17,
+    0, 1, 1<<17,
+    0, 0, 1
+  };
+  // clang-format on
+
+  // Let stride * height be bigger than 1 << 32
+  size_t dst_w = part_width;
+  size_t dst_h = part_height;
+  size_t src_w = big_width + part_width;
+  size_t src_h = big_height + part_height;
+
+  TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, 0,
+                    part_initializer);
+}
+
+TYPED_TEST(WarpPerspective, NullPointer) {
+  const TypeParam src[4] = {};
+  TypeParam dst[8];
+  test::test_null_args(kleidicv_warp_perspective_u8, src, 2, 2, 2, dst, 8, 8, 1,
+                       transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST,
+                       KLEIDICV_BORDER_TYPE_REPLICATE,
+                       kleidicv_border_values_t{});
+}
+
+TYPED_TEST(WarpPerspective, ZeroImageSize) {
+  const TypeParam src[1] = {};
+  TypeParam dst[1];
+
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_warp_perspective_u8(
+                src, 1, 0, 1, dst, 1, 0, 1, transform_identity, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                kleidicv_border_values_t{}));
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_warp_perspective_u8(
+                src, 1, 1, 0, dst, 1, 1, 0, transform_identity, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                kleidicv_border_values_t{}));
+}
+
+TYPED_TEST(WarpPerspective, InvalidImageSize) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_warp_perspective_u8(
+                src, 1, KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, dst, 8, 8, 1,
+                transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST,
+                KLEIDICV_BORDER_TYPE_REPLICATE, kleidicv_border_values_t{}));
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_RANGE,
+      kleidicv_warp_perspective_u8(
+          src, 1, KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, dst, 8,
+          8, 1, transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST,
+          KLEIDICV_BORDER_TYPE_REPLICATE, kleidicv_border_values_t{}));
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_warp_perspective_u8(
+                src, 1, 1, 1, dst, 1, KLEIDICV_MAX_IMAGE_PIXELS + 1, 1,
+                transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST,
+                KLEIDICV_BORDER_TYPE_REPLICATE, kleidicv_border_values_t{}));
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_warp_perspective_u8(
+                src, 1, 1, 1, dst, 1, KLEIDICV_MAX_IMAGE_PIXELS,
+                KLEIDICV_MAX_IMAGE_PIXELS, transform_identity, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                kleidicv_border_values_t{}));
+}
+
+TYPED_TEST(WarpPerspective, UnsupportedTwoChannels) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_warp_perspective_u8(
+                src, 1, 1, 1, dst, 8, 8, 1, transform_identity, 2,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                kleidicv_border_values_t{}));
+}
+
+TYPED_TEST(WarpPerspective, UnsupportedBorderTypeConst) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_warp_perspective_u8(
+                src, 1, 1, 1, dst, 8, 8, 1, transform_identity, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_CONSTANT,
+                kleidicv_border_values_t{}));
+}
+
+TYPED_TEST(WarpPerspective, UnsupportedTooSmallImage) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_warp_perspective_u8(
+                src, 1, 1, 1, dst, 8, 7, 1, transform_identity, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                kleidicv_border_values_t{}));
+}
+
+TYPED_TEST(WarpPerspective, UnsupportedInterpolation) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_warp_perspective_u8(
+                src, 1, 1, 1, dst, 8, 8, 1, transform_identity, 1,
+                KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE,
+                kleidicv_border_values_t{}));
+}
+
+TYPED_TEST(WarpPerspective, UnsupportedBigStride) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_warp_perspective_u8(
+                src, 1UL << 24, 1, 1, dst, 8, 8, 1, transform_identity, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                kleidicv_border_values_t{}));
+}
+
+TYPED_TEST(WarpPerspective, UnsupportedBigHeight) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            kleidicv_warp_perspective_u8(
+                src, 1, 1, 1UL << 24, dst, 8, 8, 1, transform_identity, 1,
+                KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE,
+                kleidicv_border_values_t{}));
+}
-- 
GitLab